diff --git a/.arcconfig b/.arcconfig --- a/.arcconfig +++ b/.arcconfig @@ -1,5 +1,5 @@ { - "phabricator.uri" : "https://reviews.llvm.org/", + "repository.callsign" : "G", "conduit_uri" : "https://reviews.llvm.org/" } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -194,6 +194,8 @@ __OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */ Int32) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) +__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr, Int32, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32 ) +__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32) __OMP_RTL(omp_get_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -21,10 +21,12 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include using namespace llvm; using namespace omp; @@ -61,6 +63,25 @@ OMPBuilder.initialize(); } + /// Data structure to hold information for the deleting + /// redundent OpenMP for loop calls + struct OMPLoopFusion { + bool check=false; + /// Keeps map of __kmpc_static_init4 and its __kmpc_static_fini calls for each OpenMP for loop + std::map call_init_fini_mapping; + std::map call_basicblock_mapping; + /// Keeps map of __kmpc_static_init4 and all its compatilable __kmpc_static_init4 in a vector + std::map> call_map; + 
std::map> call_arg; + /// the data structure maintain the basic blocks in a lineage + std::map> chain; + std::vector visited, loopVisited; + /// store_op0_op01 keeps map of operand 1 and operand 0 + /// args_map keeps map of arguments of __kmpc_static_init4 for later cleaning + std::map store_op0_op1, args_map; + CallInst *current_call_init_instruction = nullptr; + }; + /// Generic information that describes a runtime function struct RuntimeFunctionInfo { /// The kind, as described by the RuntimeFunction enum. @@ -107,18 +128,306 @@ /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. bool run() { bool Changed = false; - LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() << " functions in a slice with " << ModuleSlice.size() << " functions\n"); Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + Changed |= deleteStaticScheduleCalls(); return Changed; } private: + /// Combine "OpenMP for loop with static scheduling" + /// check if all parameters are same and the loops are adjacent + /// See https://openmp.llvm.org/Reference.pdf. See section 5.8.3.24 for parameters + /// The two for loops can share the same __kmpc_static_init4() and __kmpc_static_fini() + /// calls. 
+ + bool deleteStaticScheduleCalls() { + bool Changed = false; + // if there is no kmpc_for_static_init_4, there is no need to do anything + RuntimeFunctionInfo &RFI = RFIs[OMPRTL___kmpc_for_static_init_4]; + if (!RFI.Declaration) + return Changed; + // Else go through each function + OMPLoopFusion OLF; + for (Function *F : SCC) + Changed = runOverTheBlock(*F, &OLF); + return Changed; + } + +// Check the compatility of the of the __kmpc_for_static_init_4 + void checkTheCompatibility(OMPLoopFusion *OLF){ + bool compatible = true; + for (auto itr : OLF->call_init_fini_mapping) { + if (find(itr.first, OLF->call_map)) continue; + std::vector v; + std::vector v1; + for (Value *arg : (itr.first)->args()) + v1.push_back(arg); + for (auto itr1 : OLF->call_init_fini_mapping) { + if ((itr.first) == (itr1.first)) continue; + if (find(itr1.first, OLF->call_map)) continue; + std::vector v2; + for (Value *arg2 : (itr1.first)->args()) + v2.push_back(arg2); + for (auto i = v1.begin(), j = v2.begin(); i != v1.end() && j != v2.end(); ++i, ++j) { + if (isa(*i) && isa(*j)) { + if (*i != *j) {compatible = false; break;} + } + else { + if (OLF->store_op0_op1.find(*j)->second != OLF->store_op0_op1.find(*i)->second) { + compatible = false; break;} + } + } + if (compatible) { + for (auto i = v1.begin(), j = v2.begin(); i != v1.end() && j != v2.end(); ++i, ++j) { + OLF->args_map.insert({*j,*i}); + } + v.push_back(itr1.first); + } + else break; /// the adjacent for omp loop is not compatible so there is no need to check others + /// therefore we need to break out of the second for loop + } + /// if a call instruction has some compatible call instructions then put in the call_map container + OLF->call_map.insert({itr.first, v}); + /// make the flag true again for the next instruction checking + if (!compatible) compatible = true; + v.clear(); + } + } + + bool checkForOMPInit(BasicBlock* B){ + if (!B) return false; + for (BasicBlock::iterator BBI=B->begin(); BBI !=B->end(); ++BBI){ + if 
(CallInst *c= dyn_cast(BBI)){ + if (c->getCalledFunction()->getName()=="__kmpc_for_static_init_4"){ + return true;} + } + } + return false; + } + + bool checkForOMPFini(BasicBlock* B){ + if (!B) return false; + for (BasicBlock::iterator BBI=B->begin(); BBI !=B->end(); ++BBI){ + if (CallInst *c= dyn_cast(BBI)){ + if (c->getCalledFunction()->getName()=="__kmpc_for_static_fini"){ + return true;} + } + } + return false; + } + + void markNodeVisited(BasicBlock* B,std::vector &v,OMPLoopFusion *OLF){ + if (!B) return; + OLF->visited.push_back(B); + v.push_back(B); + for ( auto BB: successors(B)){ + if (find(OLF->visited,BB)) continue; + markNodeVisited(BB,v, OLF); + } + } + + BasicBlock* checkTheLoop(BasicBlock* B,std::vector &v, OMPLoopFusion *OLF){ + std::vector v2; + for (auto S: successors(B)){ + if (checkLoop(S, B, v2)) { + // mark all the node as visited + markNodeVisited(S,v,OLF); + return nullptr;} + else + return S; + } + return nullptr; + } + + bool checkLoop(BasicBlock* S, BasicBlock* B, std::vector& visit){ + bool loop = false; + if (!S) return loop; + for (auto BB: successors(S)){ + if (BB == B) {loop = true; break;} + if (find(visit, BB)) continue; + visit.push_back(BB); + loop = (loop || checkLoop (BB, B, visit)); + } + return loop; + } + + + int countSuccessors(BasicBlock* B){ + int count = 0; + for (auto BS: successors(B)) // I should use iterator instead + count++; + return count; + } + int countPredessors(BasicBlock* B){ + int count = 0; + for (auto BP: predecessors(B)) + count++; + return count; + } + void makeLineage(BasicBlock *B, std::vector &v, OMPLoopFusion *OLF){ + if (!B or find(OLF->visited, B) ) return; + if ((countSuccessors(B) <=1 ) && (countPredessors(B) > 1)) return; // unique entrance with two control flows + if ((countPredessors(B) <=1 ) && (countSuccessors(B)) > 1) return; // two control flows merging into a unique point + // these points can not be part of lineage for the optimizations + BasicBlock* t=nullptr; + // If you have a basic 
blokc try to find the omp for starting point + if (B->getSingleSuccessor()){ + OLF->visited.push_back(B); + v.push_back(B); + if (checkForOMPInit(B)) // if you find it then find the end points ; all inbetween points are are part of the lineage + t=checkOMPForLoop(B->getSingleSuccessor(), v, OLF);// the output is the basicblock for building the lineage + else + t=B->getSingleSuccessor();}// else take the successor and move on + else {// if you have a codition with more than two successors and predecessors + // we need to check if they are control points or inbetween for loops + OLF->visited.push_back(B); + t = checkTheLoop(B, v, OLF) ; + v.push_back(B); + } + makeLineage(t, v, OLF); + return; + } + + + BasicBlock* checkOMPForLoop(BasicBlock *BB,std::vector &v, OMPLoopFusion *OLF){ + BasicBlock * t = nullptr; + if (!BB) return t; + OLF->visited.push_back(BB); + v.push_back(BB); + for (auto B: successors(BB)){ + if (find(OLF->visited, B)) continue; + if (checkForOMPFini(B)) { t= B; continue;} + checkOMPForLoop (B, v, OLF); + } + return t; + } + + + bool find(std::vector b, BasicBlock* B){ + for ( auto t: b) + if (t == B) return true; + return false; + } + + bool find(CallInst *I, std::map> m) { + for (auto itr :m){ + if (itr.first== I) return true; + for (auto itr1 : (itr.second)) + if (I == itr1) return true; + } + return false; + } + + void clean_intrinsic_calls(BasicBlock* B, OMPLoopFusion *OLF){ + std::vector remove; + for (BasicBlock::iterator DI = B->begin(); DI != B->end(); ++DI ) { + if (IntrinsicInst *II = dyn_cast (DI)){ + if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end ){ + remove.push_back(II); + } + } + } + for (auto r: remove) + r->eraseFromParent(); + } + + void check_call_instructions(BasicBlock* B, OMPLoopFusion *OLF){ + for (BasicBlock::iterator DI = B->begin(); DI != B->end(); ++DI ) { + if (CallInst *c = dyn_cast(DI)) { + if (c->getCalledFunction()->getName() == "__kmpc_for_static_init_4") + 
OLF->current_call_init_instruction = c; + if (c->getCalledFunction()->getName() == "__kmpc_for_static_fini") + OLF->call_init_fini_mapping.insert({OLF->current_call_init_instruction, c}); + } + if (StoreInst *store = dyn_cast(DI)) + OLF->store_op0_op1.insert({store->getOperand(1), store->getOperand(0)}); + } + } + + bool runOverTheBlock(Function &F, OMPLoopFusion *OLF) { + std::vector v; + bool changed = false; + for (auto &BB: F) { + // on each block prepare data structure for the instructions + if (find (OLF->visited, &BB)) continue; + makeLineage (&BB, v, OLF); + OLF->chain.insert({&BB,v}); + v.clear(); + } + changed = doTheOptimization(OLF);// act on the formed lineages + + return changed; + } + + bool doTheOptimization(OMPLoopFusion *OLF){ + bool changed = false; + for (auto S: OLF->chain){ + //we have todo it for each lineage + //B is a basic block in a lineage + for ( auto B:S.second){ + check_call_instructions(B, OLF); + } + checkTheCompatibility(OLF); + changed = cleanInstructions(OLF); + if (changed) + for (auto B:S.second){ + replace_UseValues(B, OLF); + clean_intrinsic_calls(B, OLF); + } + OLF->call_init_fini_mapping.clear(); + OLF->call_map.clear(); + OLF->store_op0_op1.clear(); + OLF->args_map.clear(); + + } + return changed; + } + + void replace_UseValues(BasicBlock* B, OMPLoopFusion *OLF){ + std::vector remove; + for (BasicBlock::iterator II = B->begin(); II != B->end(); ++II) { + Instruction *It = dyn_cast(II); + if (isa(It)) continue; + for (unsigned int k = 0; k < It->getNumOperands(); k++){ + auto temp = OLF->args_map.find(It->getOperand(k)); + if (temp != OLF->args_map.end()){ + It->setOperand(k, temp->second); + if (isa(It) && k > 0) remove.push_back(It); + } + } + } + for (auto r: remove) + r->eraseFromParent(); + } + + bool cleanInstructions(OMPLoopFusion *OLF) { + bool changed = false; + for (auto itr : OLF->call_map) { + int count = (itr.second).size(); + if (!count) continue; + Instruction *I = 
OLF->call_init_fini_mapping.find(itr.first)->second; + I->eraseFromParent(); + changed = true; + for (auto itr1:itr.second) { + Instruction *I1 = itr1; + Instruction *I2 = OLF->call_init_fini_mapping.find(itr1)->second; + I1->eraseFromParent(); + if (count == 1) break; + I2->eraseFromParent(); + count--; + } + } + return changed; + } + + + /// Try to delete parallel regions if possible bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; diff --git a/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll b/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll @@ -0,0 +1,909 @@ +; The IR is produced using << -fopenmp -emit-llvm -S -c parallelMergeForLoop.c -o parallelMergeForLoop.ll >> flags +; RUN: opt -S -attributor -openmpopt -O3 < %s | FileCheck %s +; ModuleID = 'parallelMergeForLoop.c' +source_filename = "parallelMergeForLoop.c" +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.15.0" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@.str = private unnamed_addr constant [3 x i8] c"%d\00", align 1 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_all() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %1) + ret void +} +;;;;; Test-1 +;void merge_all(){ +; int a=0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=0; i < 100; i++) +; a=i; +; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +;} +;;; Both the loops should be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined.(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %21 = load i32*, i32** %6, align 8 + store i32 0, i32* %9, align 4 + store i32 99, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %22 = load i32*, i32** %4, align 8 + %23 = load i32, i32* %22, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %24 = load i32, i32* %10, align 4 + %25 = icmp sgt i32 %24, 99 + br i1 %25, label %26, label %27 + +26: ; preds = %3 + br label %29 + +27: ; preds = %3 + %28 = load i32, i32* %10, align 4 + br label %29 + +29: ; preds = %27, %26 + %30 = phi i32 [ 99, %26 ], [ %28, %27 ] + store i32 %30, i32* %10, align 4 + %31 = load i32, i32* %9, align 4 + store i32 %31, i32* %7, align 4 + br label %32 + +32: ; preds = %42, %29 + %33 = load i32, i32* %7, 
align 4 + %34 = load i32, i32* %10, align 4 + %35 = icmp sle i32 %33, %34 + br i1 %35, label %36, label %45 + +36: ; preds = %32 + %37 = load i32, i32* %7, align 4 + %38 = mul nsw i32 %37, 1 + %39 = add nsw i32 0, %38 + store i32 %39, i32* %13, align 4 + %40 = load i32, i32* %13, align 4 + store i32 %40, i32* %21, align 4 + br label %41 + +41: ; preds = %36 + br label %42 + +42: ; preds = %41 + %43 = load i32, i32* %7, align 4 + %44 = add nsw i32 %43, 1 + store i32 %44, i32* %7, align 4 + br label %32 + +45: ; preds = %32 + br label %46 + +46: ; preds = %45 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %47 = load i32, i32* %17, align 4 + %48 = icmp sgt i32 %47, 99 + br i1 %48, label %49, label %50 + +49: ; preds = %46 + br label %52 + +50: ; preds = %46 + %51 = load i32, i32* %17, align 4 + br label %52 + +52: ; preds = %50, %49 + %53 = phi i32 [ 99, %49 ], [ %51, %50 ] + store i32 %53, i32* %17, align 4 + %54 = load i32, i32* %16, align 4 + store i32 %54, i32* %14, align 4 + br label %55 + +55: ; preds = %65, %52 + %56 = load i32, i32* %14, align 4 + %57 = load i32, i32* %17, align 4 + %58 = icmp sle i32 %56, %57 + br i1 %58, label %59, label %68 + +59: ; preds = %55 + %60 = load i32, i32* %14, align 4 + %61 = mul nsw i32 %60, 1 + %62 = add nsw i32 0, %61 + store i32 %62, i32* %20, align 4 + %63 = load i32, i32* %20, align 4 + store i32 %63, i32* %21, align 4 + br label %64 + +64: ; preds = %59 + br label %65 + +65: ; preds = %64 + %66 = load i32, i32* %14, align 4 + %67 = add nsw i32 %66, 1 + store i32 %67, i32* %14, align 4 + br label %55 + +68: ; preds = %55 + br label %69 + +69: ; preds = %68 + call void 
@__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: nounwind +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) #2 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #3 + +; Function Attrs: nounwind +declare !callback !4 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #2 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_none() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* %1) + ret void +} + +;;;;; Test-2 +;void merge_none(){ +; int a=0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=1; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +;} +;;; The two OMP for loops should not be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..1(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %21 = load i32*, i32** %6, 
align 8 + store i32 0, i32* %9, align 4 + store i32 98, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %22 = load i32*, i32** %4, align 8 + %23 = load i32, i32* %22, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %24 = load i32, i32* %10, align 4 + %25 = icmp sgt i32 %24, 98 + br i1 %25, label %26, label %27 + +26: ; preds = %3 + br label %29 + +27: ; preds = %3 + %28 = load i32, i32* %10, align 4 + br label %29 + +29: ; preds = %27, %26 + %30 = phi i32 [ 98, %26 ], [ %28, %27 ] + store i32 %30, i32* %10, align 4 + %31 = load i32, i32* %9, align 4 + store i32 %31, i32* %7, align 4 + br label %32 + +32: ; preds = %42, %29 + %33 = load i32, i32* %7, align 4 + %34 = load i32, i32* %10, align 4 + %35 = icmp sle i32 %33, %34 + br i1 %35, label %36, label %45 + +36: ; preds = %32 + %37 = load i32, i32* %7, align 4 + %38 = mul nsw i32 %37, 1 + %39 = add nsw i32 1, %38 + store i32 %39, i32* %13, align 4 + %40 = load i32, i32* %13, align 4 + store i32 %40, i32* %21, align 4 + br label %41 + +41: ; preds = %36 + br label %42 + +42: ; preds = %41 + %43 = load i32, i32* %7, align 4 + %44 = add nsw i32 %43, 1 + store i32 %44, i32* %7, align 4 + br label %32 + +45: ; preds = %32 + br label %46 + +46: ; preds = %45 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %47 = load i32, i32* %17, align 4 + %48 = icmp sgt i32 %47, 99 + br i1 %48, label %49, label %50 + +49: ; preds = %46 + br label %52 + +50: ; preds = %46 + %51 = load i32, i32* %17, align 4 + br label %52 + +52: ; preds = %50, %49 + %53 = phi i32 [ 99, %49 ], 
[ %51, %50 ] + store i32 %53, i32* %17, align 4 + %54 = load i32, i32* %16, align 4 + store i32 %54, i32* %14, align 4 + br label %55 + +55: ; preds = %65, %52 + %56 = load i32, i32* %14, align 4 + %57 = load i32, i32* %17, align 4 + %58 = icmp sle i32 %56, %57 + br i1 %58, label %59, label %68 + +59: ; preds = %55 + %60 = load i32, i32* %14, align 4 + %61 = mul nsw i32 %60, 1 + %62 = add nsw i32 0, %61 + store i32 %62, i32* %20, align 4 + %63 = load i32, i32* %20, align 4 + store i32 %63, i32* %21, align 4 + br label %64 + +64: ; preds = %59 + br label %65 + +65: ; preds = %64 + %66 = load i32, i32* %14, align 4 + %67 = add nsw i32 %66, 1 + store i32 %67, i32* %14, align 4 + br label %55 + +68: ; preds = %55 + br label %69 + +69: ; preds = %68 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_some() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* %1) + ret void +} + +;;;;; Test-3 +;void merge_some(){ +; int a = 0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=1; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; #pragma omp for +; for (int k=0; k < 100; k++) +; a=k; +; } +;} +;;; The last two OMP for loops should be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..2(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %28 = load i32*, i32** %6, align 8 + store i32 0, i32* %9, align 4 + store i32 98, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %29 = load i32*, i32** %4, align 8 + %30 = load i32, i32* %29, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %31 = load i32, i32* %10, align 4 + %32 = icmp sgt i32 %31, 98 + br i1 %32, label %33, label %34 + +33: ; preds = %3 + br label %36 + +34: ; preds = %3 + %35 = load i32, 
i32* %10, align 4 + br label %36 + +36: ; preds = %34, %33 + %37 = phi i32 [ 98, %33 ], [ %35, %34 ] + store i32 %37, i32* %10, align 4 + %38 = load i32, i32* %9, align 4 + store i32 %38, i32* %7, align 4 + br label %39 + +39: ; preds = %49, %36 + %40 = load i32, i32* %7, align 4 + %41 = load i32, i32* %10, align 4 + %42 = icmp sle i32 %40, %41 + br i1 %42, label %43, label %52 + +43: ; preds = %39 + %44 = load i32, i32* %7, align 4 + %45 = mul nsw i32 %44, 1 + %46 = add nsw i32 1, %45 + store i32 %46, i32* %13, align 4 + %47 = load i32, i32* %13, align 4 + store i32 %47, i32* %28, align 4 + br label %48 + +48: ; preds = %43 + br label %49 + +49: ; preds = %48 + %50 = load i32, i32* %7, align 4 + %51 = add nsw i32 %50, 1 + store i32 %51, i32* %7, align 4 + br label %39 + +52: ; preds = %39 + br label %53 + +53: ; preds = %52 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %54 = load i32, i32* %17, align 4 + %55 = icmp sgt i32 %54, 99 + br i1 %55, label %56, label %57 + +56: ; preds = %53 + br label %59 + +57: ; preds = %53 + %58 = load i32, i32* %17, align 4 + br label %59 + +59: ; preds = %57, %56 + %60 = phi i32 [ 99, %56 ], [ %58, %57 ] + store i32 %60, i32* %17, align 4 + %61 = load i32, i32* %16, align 4 + store i32 %61, i32* %14, align 4 + br label %62 + +62: ; preds = %72, %59 + %63 = load i32, i32* %14, align 4 + %64 = load i32, i32* %17, align 4 + %65 = icmp sle i32 %63, %64 + br i1 %65, label %66, label %75 + +66: ; preds = %62 + %67 = load i32, i32* %14, align 4 + %68 = mul nsw i32 %67, 1 + %69 = add nsw i32 0, %68 + store i32 %69, i32* %20, align 4 + %70 = load i32, i32* %20, align 4 + store i32 %70, i32* %28, align 4 
+ br label %71 + +71: ; preds = %66 + br label %72 + +72: ; preds = %71 + %73 = load i32, i32* %14, align 4 + %74 = add nsw i32 %73, 1 + store i32 %74, i32* %14, align 4 + br label %62 + +75: ; preds = %62 + br label %76 + +76: ; preds = %75 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + store i32 0, i32* %23, align 4 + store i32 99, i32* %24, align 4 + store i32 1, i32* %25, align 4 + store i32 0, i32* %26, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %26, i32* %23, i32* %24, i32* %25, i32 1, i32 1) + %77 = load i32, i32* %24, align 4 + %78 = icmp sgt i32 %77, 99 + br i1 %78, label %79, label %80 + +79: ; preds = %76 + br label %82 + +80: ; preds = %76 + %81 = load i32, i32* %24, align 4 + br label %82 + +82: ; preds = %80, %79 + %83 = phi i32 [ 99, %79 ], [ %81, %80 ] + store i32 %83, i32* %24, align 4 + %84 = load i32, i32* %23, align 4 + store i32 %84, i32* %21, align 4 + br label %85 + +85: ; preds = %95, %82 + %86 = load i32, i32* %21, align 4 + %87 = load i32, i32* %24, align 4 + %88 = icmp sle i32 %86, %87 + br i1 %88, label %89, label %98 + +89: ; preds = %85 + %90 = load i32, i32* %21, align 4 + %91 = mul nsw i32 %90, 1 + %92 = add nsw i32 0, %91 + store i32 %92, i32* %27, align 4 + %93 = load i32, i32* %27, align 4 + store i32 %93, i32* %28, align 4 + br label %94 + +94: ; preds = %89 + br label %95 + +95: ; preds = %94 + %96 = load i32, i32* %21, align 4 + %97 = add nsw i32 %96, 1 + store i32 %97, i32* %21, align 4 + br label %85 + +98: ; preds = %85 + br label %99 + +99: ; preds = %98 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_conditional(i32 %0) #0 { + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + store i32 0, i32* %3, 
align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +;;;;; Test-4 +;void merge_conditional(int x){ +; int a = 0; +; #pragma omp parallel +; { +; if (x < 10) +; { +; #pragma omp for +; for (int i=0; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +; else +; { +; #pragma omp for +; for (int k=0; k < 100; k++) +; a=k; +; } +; } +;} +;;; The OMP for loops in the first conditional block should be merged + + + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..3(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2, i32* nonnull align 4 dereferenceable(4) %3) #1 { + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32*, align 8 + %8 = alloca i32*, align 8 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + %28 = alloca i32, align 4 + %29 = alloca i32, align 4 + store i32* %0, i32** %5, align 8 + store i32* %1, i32** %6, align 8 + store i32* %2, i32** %7, align 8 + store i32* %3, i32** %8, align 8 + %30 = load i32*, i32** %7, align 8 + %31 = load i32*, i32** %8, align 8 + %32 = load i32, i32* %30, align 4 + %33 = icmp slt i32 %32, 10 + br i1 %33, label %34, label %93 + +34: ; preds = %4 + store i32 0, i32* %11, align 4 + store i32 99, i32* %12, align 4 + 
store i32 1, i32* %13, align 4 + store i32 0, i32* %14, align 4 + %35 = load i32*, i32** %5, align 8 + %36 = load i32, i32* %35, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %36, i32 34, i32* %14, i32* %11, i32* %12, i32* %13, i32 1, i32 1) + %37 = load i32, i32* %12, align 4 + %38 = icmp sgt i32 %37, 99 + br i1 %38, label %39, label %40 + +39: ; preds = %34 + br label %42 + +40: ; preds = %34 + %41 = load i32, i32* %12, align 4 + br label %42 + +42: ; preds = %40, %39 + %43 = phi i32 [ 99, %39 ], [ %41, %40 ] + store i32 %43, i32* %12, align 4 + %44 = load i32, i32* %11, align 4 + store i32 %44, i32* %9, align 4 + br label %45 + +45: ; preds = %55, %42 + %46 = load i32, i32* %9, align 4 + %47 = load i32, i32* %12, align 4 + %48 = icmp sle i32 %46, %47 + br i1 %48, label %49, label %58 + +49: ; preds = %45 + %50 = load i32, i32* %9, align 4 + %51 = mul nsw i32 %50, 1 + %52 = add nsw i32 0, %51 + store i32 %52, i32* %15, align 4 + %53 = load i32, i32* %15, align 4 + store i32 %53, i32* %31, align 4 + br label %54 + +54: ; preds = %49 + br label %55 + +55: ; preds = %54 + %56 = load i32, i32* %9, align 4 + %57 = add nsw i32 %56, 1 + store i32 %57, i32* %9, align 4 + br label %45 + +58: ; preds = %45 + br label %59 + +59: ; preds = %58 + %60 = load i32*, i32** %5, align 8 + %61 = load i32, i32* %60, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %61) + %62 = load i32*, i32** %5, align 8 + %63 = load i32, i32* %62, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %63) + store i32 0, i32* %18, align 4 + store i32 99, i32* %19, align 4 + store i32 1, i32* %20, align 4 + store i32 0, i32* %21, align 4 + %64 = load i32*, i32** %5, align 8 + %65 = load i32, i32* %64, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %65, i32 34, i32* %21, i32* %18, i32* %19, i32* %20, i32 1, i32 1) + %66 = load i32, i32* %19, align 4 + %67 = icmp sgt i32 %66, 99 + br i1 %67, label %68, label %69 + +68: ; preds 
= %59 + br label %71 + +69: ; preds = %59 + %70 = load i32, i32* %19, align 4 + br label %71 + +71: ; preds = %69, %68 + %72 = phi i32 [ 99, %68 ], [ %70, %69 ] + store i32 %72, i32* %19, align 4 + %73 = load i32, i32* %18, align 4 + store i32 %73, i32* %16, align 4 + br label %74 + +74: ; preds = %84, %71 + %75 = load i32, i32* %16, align 4 + %76 = load i32, i32* %19, align 4 + %77 = icmp sle i32 %75, %76 + br i1 %77, label %78, label %87 + +78: ; preds = %74 + %79 = load i32, i32* %16, align 4 + %80 = mul nsw i32 %79, 1 + %81 = add nsw i32 0, %80 + store i32 %81, i32* %22, align 4 + %82 = load i32, i32* %22, align 4 + store i32 %82, i32* %31, align 4 + br label %83 + +83: ; preds = %78 + br label %84 + +84: ; preds = %83 + %85 = load i32, i32* %16, align 4 + %86 = add nsw i32 %85, 1 + store i32 %86, i32* %16, align 4 + br label %74 + +87: ; preds = %74 + br label %88 + +88: ; preds = %87 + %89 = load i32*, i32** %5, align 8 + %90 = load i32, i32* %89, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %90) + %91 = load i32*, i32** %5, align 8 + %92 = load i32, i32* %91, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %92) + br label %123 + +93: ; preds = %4 + store i32 0, i32* %25, align 4 + store i32 99, i32* %26, align 4 + store i32 1, i32* %27, align 4 + store i32 0, i32* %28, align 4 + %94 = load i32*, i32** %5, align 8 + %95 = load i32, i32* %94, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %95, i32 34, i32* %28, i32* %25, i32* %26, i32* %27, i32 1, i32 1) + %96 = load i32, i32* %26, align 4 + %97 = icmp sgt i32 %96, 99 + br i1 %97, label %98, label %99 + +98: ; preds = %93 + br label %101 + +99: ; preds = %93 + %100 = load i32, i32* %26, align 4 + br label %101 + +101: ; preds = %99, %98 + %102 = phi i32 [ 99, %98 ], [ %100, %99 ] + store i32 %102, i32* %26, align 4 + %103 = load i32, i32* %25, align 4 + store i32 %103, i32* %23, align 4 + br label %104 + +104: ; preds = %114, %101 + %105 = load i32, 
i32* %23, align 4 + %106 = load i32, i32* %26, align 4 + %107 = icmp sle i32 %105, %106 + br i1 %107, label %108, label %117 + +108: ; preds = %104 + %109 = load i32, i32* %23, align 4 + %110 = mul nsw i32 %109, 1 + %111 = add nsw i32 0, %110 + store i32 %111, i32* %29, align 4 + %112 = load i32, i32* %29, align 4 + store i32 %112, i32* %31, align 4 + br label %113 + +113: ; preds = %108 + br label %114 + +114: ; preds = %113 + %115 = load i32, i32* %23, align 4 + %116 = add nsw i32 %115, 1 + store i32 %116, i32* %23, align 4 + br label %104 + +117: ; preds = %104 + br label %118 + +118: ; preds = %117 + %119 = load i32*, i32** %5, align 8 + %120 = load i32, i32* %119, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %120) + %121 = load i32*, i32** %5, align 8 + %122 = load i32, i32* %121, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %122) + br label %123 + +123: ; preds = %118, %88 + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @main() #0 { + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + %3 = call i32 (i8*, ...) @scanf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), i32* %2) + call void @merge_all() + call void @merge_none() + call void @merge_some() + %4 = load i32, i32* %2, align 4 + call void @merge_conditional(i32 %4) + ret i32 0 +} + +declare i32 @scanf(i8*, ...) 
#4 + +attributes #0 = { noinline nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { convergent nounwind } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 15]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"PIC Level", i32 2} +!3 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 979bcbd3a6f7ea784f2098ad4cf613fbd6b09e38)"} +!4 = !{!5} +!5 = !{i64 2, i64 -1, i64 -1, i1 true} + + +;CHECK-LABEL: define void @merge_all() local_unnamed_addr #0{ +;CHECK-NEXT: 
[[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined.( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_none() local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined..1( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_some() local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined..2( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_conditional(i32 [[TMP1:%.*]]) local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK-NEXT: [[TMP3:%.*]] = alloca i32, align 4 +;CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP2]], align 4, !tbaa !4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nonnull %2, i32* nonnull [[TMP3]]) +;CHECK: br i1 [[TMP4:%.*]], label [[TMP5:%.*]], label [[TMP6:%.*]] +;CHECK-NEXT: [[TMP5]]: +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: [[TMP6]]: +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void diff --git a/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp b/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c -std=c99 -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics + +void test_1(){ + +#pragma omp parallel +{ + #pragma omp for + for (int i=0; i < 100; i++) + ; + #pragma omp for + for (int j=0; j < 10; j++) + ; + 
#pragma omp for + for (int i=0; i < 10; i++) + ; + #pragma omp for + for (int i=0; i < 10; i++) + ; +} + // The first parallel for loop will not be merged + // The last three parallel for loops will be merged +} + + +// CHECK: define void @test_1() +// CHECK: ...) @__kmpc_fork_call( +// CHECK: ret void +// CHECK: define internal void @.omp_outlined.( +// CHECK: call void @__kmpc_for_static_init_4( +// CHECK: call void @__kmpc_for_static_fini( +// CHECK-NEXT: call void @__kmpc_barrier( +// CHECK: call void @__kmpc_for_static_init_4( +// CHECK: call void @__kmpc_barrier( +// CHECK: call void @__kmpc_barrier( +// CHECK: call void @__kmpc_for_static_fini( +// CHECK-NEXT: call void @__kmpc_barrier( +// CHECK: ret void diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -94,6 +94,7 @@ static kmp_int8 warn = 0; + if (ompt_enabled.ompt_callback_work) { // Only fully initialize variables needed by OMPT if OMPT is enabled. team_info = __ompt_get_teaminfo(0, NULL);