diff --git a/.arcconfig b/.arcconfig --- a/.arcconfig +++ b/.arcconfig @@ -1,5 +1,5 @@ { - "phabricator.uri" : "https://reviews.llvm.org/", + "repository.callsign" : "G", "conduit_uri" : "https://reviews.llvm.org/" } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -194,6 +194,8 @@ __OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */ Int32) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) +__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr, Int32, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32 ) +__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32) __OMP_RTL(omp_get_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -21,10 +21,12 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include using namespace llvm; using namespace omp; @@ -61,6 +63,25 @@ OMPBuilder.initialize(); } + /// Data structure to hold information for the deleting + /// redundent OpenMP for loop calls + struct OMPLoopFusion { + bool check=false; + /// Keeps map of __kmpc_static_init4 and its __kmpc_static_fini calls for each OpenMP for loop + std::map call_init_fini_mapping; + std::map call_basicblock_mapping; + /// Keeps map of __kmpc_static_init4 and all its compatilable __kmpc_static_init4 in a vector + std::map> call_map; + 
std::map> call_arg; + /// the data structure maintain the basic blocks in a lineage + std::map> chain; + std::vector visited, loopVisited; + /// store_op0_op01 keeps map of operand 1 and operand 0 + /// args_map keeps map of arguments of __kmpc_static_init4 for later cleaning + std::map store_op0_op1, args_map; + CallInst *current_call_init_instruction = nullptr; + }; + /// Generic information that describes a runtime function struct RuntimeFunctionInfo { /// The kind, as described by the RuntimeFunction enum. @@ -107,18 +128,306 @@ /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. bool run() { bool Changed = false; - LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() << " functions in a slice with " << ModuleSlice.size() << " functions\n"); Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + Changed |= deleteStaticScheduleCalls(); return Changed; } private: + /// Combine "OpenMP for loop with static scheduling" + /// check if all parameters are same and the loops are adjacent + /// See https://openmp.llvm.org/Reference.pdf. See section 5.8.3.24 for parameters + /// The two for loops can share the same __kmpc_static_init4() and __kmpc_static_fini() + /// calls. 
+ + bool deleteStaticScheduleCalls() { + bool Changed = false; + // if there is no kmpc_for_static_init_4, there is no need to do anything + RuntimeFunctionInfo &RFI = RFIs[OMPRTL___kmpc_for_static_init_4]; + if (!RFI.Declaration) + return Changed; + // Else go through each function + OMPLoopFusion OLF; + for (Function *F : SCC) + Changed = runOverTheBlock(*F, &OLF); + return Changed; + } + +// Check the compatility of the of the __kmpc_for_static_init_4 + void checkTheCompatibility(OMPLoopFusion *OLF){ + bool compatible = true; + for (auto itr : OLF->call_init_fini_mapping) { + if (find(itr.first, OLF->call_map)) continue; + std::vector v; + std::vector v1; + for (Value *arg : (itr.first)->args()) + v1.push_back(arg); + for (auto itr1 : OLF->call_init_fini_mapping) { + if ((itr.first) == (itr1.first)) continue; + if (find(itr1.first, OLF->call_map)) continue; + std::vector v2; + for (Value *arg2 : (itr1.first)->args()) + v2.push_back(arg2); + for (auto i = v1.begin(), j = v2.begin(); i != v1.end() && j != v2.end(); ++i, ++j) { + if (isa(*i) && isa(*j)) { + if (*i != *j) {compatible = false; break;} + } + else { + if (OLF->store_op0_op1.find(*j)->second != OLF->store_op0_op1.find(*i)->second) { + compatible = false; break;} + } + } + if (compatible) { + for (auto i = v1.begin(), j = v2.begin(); i != v1.end() && j != v2.end(); ++i, ++j) { + OLF->args_map.insert({*j,*i}); + } + v.push_back(itr1.first); + } + else break; /// the adjacent for omp loop is not compatible so there is no need to check others + /// therefore we need to break out of the second for loop + } + /// if a call instruction has some compatible call instructions then put in the call_map container + OLF->call_map.insert({itr.first, v}); + /// make the flag true again for the next instruction checking + if (!compatible) compatible = true; + v.clear(); + } + } + + bool checkForOMPInit(BasicBlock* B){ + if (!B) return false; + for (BasicBlock::iterator BBI=B->begin(); BBI !=B->end(); ++BBI){ + if 
(CallInst *c= dyn_cast(BBI)){ + if (c->getCalledFunction()->getName()=="__kmpc_for_static_init_4"){ + return true;} + } + } + return false; + } + + bool checkForOMPFini(BasicBlock* B){ + if (!B) return false; + for (BasicBlock::iterator BBI=B->begin(); BBI !=B->end(); ++BBI){ + if (CallInst *c= dyn_cast(BBI)){ + if (c->getCalledFunction()->getName()=="__kmpc_for_static_fini"){ + return true;} + } + } + return false; + } + + void markNodeVisited(BasicBlock* B,std::vector &v,OMPLoopFusion *OLF){ + if (!B) return; + OLF->visited.push_back(B); + v.push_back(B); + for ( auto BB: successors(B)){ + if (find(OLF->visited,BB)) continue; + markNodeVisited(BB,v, OLF); + } + } + + BasicBlock* checkTheLoop(BasicBlock* B,std::vector &v, OMPLoopFusion *OLF){ + std::vector v2; + for (auto S: successors(B)){ + if (checkLoop(S, B, v2)) { + // mark all the node as visited + markNodeVisited(S,v,OLF); + return nullptr;} + else + return S; + } + return nullptr; + } + + bool checkLoop(BasicBlock* S, BasicBlock* B, std::vector& visit){ + bool loop = false; + if (!S) return loop; + for (auto BB: successors(S)){ + if (BB == B) {loop = true; break;} + if (find(visit, BB)) continue; + visit.push_back(BB); + loop = (loop || checkLoop (BB, B, visit)); + } + return loop; + } + + + int countSuccessors(BasicBlock* B){ + int count = 0; + for (auto BS: successors(B)) // I should use iterator instead + count++; + return count; + } + int countPredessors(BasicBlock* B){ + int count = 0; + for (auto BP: predecessors(B)) + count++; + return count; + } + void makeLineage(BasicBlock *B, std::vector &v, OMPLoopFusion *OLF){ + if (!B or find(OLF->visited, B) ) return; + if ((countSuccessors(B) <=1 ) && (countPredessors(B) > 1)) return; // unique entrance with two control flows + if ((countPredessors(B) <=1 ) && (countSuccessors(B)) > 1) return; // two control flows merging into a unique point + // these points can not be part of lineage for the optimizations + BasicBlock* t=nullptr; + // If you have a basic 
blokc try to find the omp for starting point + if (B->getSingleSuccessor()){ + OLF->visited.push_back(B); + v.push_back(B); + if (checkForOMPInit(B)) // if you find it then find the end points ; all inbetween points are are part of the lineage + t=checkOMPForLoop(B->getSingleSuccessor(), v, OLF);// the output is the basicblock for building the lineage + else + t=B->getSingleSuccessor();}// else take the successor and move on + else {// if you have a codition with more than two successors and predecessors + // we need to check if they are control points or inbetween for loops + OLF->visited.push_back(B); + t = checkTheLoop(B, v, OLF) ; + v.push_back(B); + } + makeLineage(t, v, OLF); + return; + } + + + BasicBlock* checkOMPForLoop(BasicBlock *BB,std::vector &v, OMPLoopFusion *OLF){ + BasicBlock * t = nullptr; + if (!BB) return t; + OLF->visited.push_back(BB); + v.push_back(BB); + for (auto B: successors(BB)){ + if (find(OLF->visited, B)) continue; + if (checkForOMPFini(B)) { t= B; continue;} + checkOMPForLoop (B, v, OLF); + } + return t; + } + + + bool find(std::vector b, BasicBlock* B){ + for ( auto t: b) + if (t == B) return true; + return false; + } + + bool find(CallInst *I, std::map> m) { + for (auto itr :m){ + if (itr.first== I) return true; + for (auto itr1 : (itr.second)) + if (I == itr1) return true; + } + return false; + } + + void clean_intrinsic_calls(BasicBlock* B, OMPLoopFusion *OLF){ + std::vector remove; + for (BasicBlock::iterator DI = B->begin(); DI != B->end(); ++DI ) { + if (IntrinsicInst *II = dyn_cast (DI)){ + if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end ){ + remove.push_back(II); + } + } + } + for (auto r: remove) + r->eraseFromParent(); + } + + void check_call_instructions(BasicBlock* B, OMPLoopFusion *OLF){ + for (BasicBlock::iterator DI = B->begin(); DI != B->end(); ++DI ) { + if (CallInst *c = dyn_cast(DI)) { + if (c->getCalledFunction()->getName() == "__kmpc_for_static_init_4") + 
OLF->current_call_init_instruction = c; + if (c->getCalledFunction()->getName() == "__kmpc_for_static_fini") + OLF->call_init_fini_mapping.insert({OLF->current_call_init_instruction, c}); + } + if (StoreInst *store = dyn_cast(DI)) + OLF->store_op0_op1.insert({store->getOperand(1), store->getOperand(0)}); + } + } + + bool runOverTheBlock(Function &F, OMPLoopFusion *OLF) { + std::vector v; + bool changed = false; + for (auto &BB: F) { + // on each block prepare data structure for the instructions + if (find (OLF->visited, &BB)) continue; + makeLineage (&BB, v, OLF); + OLF->chain.insert({&BB,v}); + v.clear(); + } + changed = doTheOptimization(OLF);// act on the formed lineages + + return changed; + } + + bool doTheOptimization(OMPLoopFusion *OLF){ + bool changed = false; + for (auto S: OLF->chain){ + //we have todo it for each lineage + //B is a basic block in a lineage + for ( auto B:S.second){ + check_call_instructions(B, OLF); + } + checkTheCompatibility(OLF); + changed = cleanInstructions(OLF); + if (changed) + for (auto B:S.second){ + replace_UseValues(B, OLF); + clean_intrinsic_calls(B, OLF); + } + OLF->call_init_fini_mapping.clear(); + OLF->call_map.clear(); + OLF->store_op0_op1.clear(); + OLF->args_map.clear(); + + } + return changed; + } + + void replace_UseValues(BasicBlock* B, OMPLoopFusion *OLF){ + std::vector remove; + for (BasicBlock::iterator II = B->begin(); II != B->end(); ++II) { + Instruction *It = dyn_cast(II); + if (isa(It)) continue; + for (unsigned int k = 0; k < It->getNumOperands(); k++){ + auto temp = OLF->args_map.find(It->getOperand(k)); + if (temp != OLF->args_map.end()){ + It->setOperand(k, temp->second); + if (isa(It) && k > 0) remove.push_back(It); + } + } + } + for (auto r: remove) + r->eraseFromParent(); + } + + bool cleanInstructions(OMPLoopFusion *OLF) { + bool changed = false; + for (auto itr : OLF->call_map) { + int count = (itr.second).size(); + if (!count) continue; + Instruction *I = 
OLF->call_init_fini_mapping.find(itr.first)->second; + I->eraseFromParent(); + changed = true; + for (auto itr1:itr.second) { + Instruction *I1 = itr1; + Instruction *I2 = OLF->call_init_fini_mapping.find(itr1)->second; + I1->eraseFromParent(); + if (count == 1) break; + I2->eraseFromParent(); + count--; + } + } + return changed; + } + + + /// Try to delete parallel regions if possible bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; diff --git a/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll b/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll @@ -0,0 +1,909 @@ +; The IR is produced using << -fopenmp -emit-llvm -S -c parallelMergeForLoop.c -o parallelMergeForLoop.ll >> flags +; RUN: opt -S -attributor -openmpopt -O3 < %s | FileCheck %s +; ModuleID = 'parallelMergeForLoop.c' +source_filename = "parallelMergeForLoop.c" +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.15.0" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@.str = private unnamed_addr constant [3 x i8] c"%d\00", align 1 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_all() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %1) + ret void +} +;;;;; Test-1 +;void merge_all(){ +; int a=0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=0; i < 100; i++) +; a=i; +; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +;} +;;; Both the loops should be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined.(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %21 = load i32*, i32** %6, align 8 + store i32 0, i32* %9, align 4 + store i32 99, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %22 = load i32*, i32** %4, align 8 + %23 = load i32, i32* %22, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %24 = load i32, i32* %10, align 4 + %25 = icmp sgt i32 %24, 99 + br i1 %25, label %26, label %27 + +26: ; preds = %3 + br label %29 + +27: ; preds = %3 + %28 = load i32, i32* %10, align 4 + br label %29 + +29: ; preds = %27, %26 + %30 = phi i32 [ 99, %26 ], [ %28, %27 ] + store i32 %30, i32* %10, align 4 + %31 = load i32, i32* %9, align 4 + store i32 %31, i32* %7, align 4 + br label %32 + +32: ; preds = %42, %29 + %33 = load i32, i32* %7, 
align 4 + %34 = load i32, i32* %10, align 4 + %35 = icmp sle i32 %33, %34 + br i1 %35, label %36, label %45 + +36: ; preds = %32 + %37 = load i32, i32* %7, align 4 + %38 = mul nsw i32 %37, 1 + %39 = add nsw i32 0, %38 + store i32 %39, i32* %13, align 4 + %40 = load i32, i32* %13, align 4 + store i32 %40, i32* %21, align 4 + br label %41 + +41: ; preds = %36 + br label %42 + +42: ; preds = %41 + %43 = load i32, i32* %7, align 4 + %44 = add nsw i32 %43, 1 + store i32 %44, i32* %7, align 4 + br label %32 + +45: ; preds = %32 + br label %46 + +46: ; preds = %45 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %47 = load i32, i32* %17, align 4 + %48 = icmp sgt i32 %47, 99 + br i1 %48, label %49, label %50 + +49: ; preds = %46 + br label %52 + +50: ; preds = %46 + %51 = load i32, i32* %17, align 4 + br label %52 + +52: ; preds = %50, %49 + %53 = phi i32 [ 99, %49 ], [ %51, %50 ] + store i32 %53, i32* %17, align 4 + %54 = load i32, i32* %16, align 4 + store i32 %54, i32* %14, align 4 + br label %55 + +55: ; preds = %65, %52 + %56 = load i32, i32* %14, align 4 + %57 = load i32, i32* %17, align 4 + %58 = icmp sle i32 %56, %57 + br i1 %58, label %59, label %68 + +59: ; preds = %55 + %60 = load i32, i32* %14, align 4 + %61 = mul nsw i32 %60, 1 + %62 = add nsw i32 0, %61 + store i32 %62, i32* %20, align 4 + %63 = load i32, i32* %20, align 4 + store i32 %63, i32* %21, align 4 + br label %64 + +64: ; preds = %59 + br label %65 + +65: ; preds = %64 + %66 = load i32, i32* %14, align 4 + %67 = add nsw i32 %66, 1 + store i32 %67, i32* %14, align 4 + br label %55 + +68: ; preds = %55 + br label %69 + +69: ; preds = %68 + call void 
@__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: nounwind +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) #2 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #3 + +; Function Attrs: nounwind +declare !callback !4 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #2 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_none() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* %1) + ret void +} + +;;;;; Test-2 +;void merge_none(){ +; int a=0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=1; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +;} +;;; The two OMP for loops should not be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..1(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %21 = load i32*, i32** %6, 
align 8 + store i32 0, i32* %9, align 4 + store i32 98, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %22 = load i32*, i32** %4, align 8 + %23 = load i32, i32* %22, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %24 = load i32, i32* %10, align 4 + %25 = icmp sgt i32 %24, 98 + br i1 %25, label %26, label %27 + +26: ; preds = %3 + br label %29 + +27: ; preds = %3 + %28 = load i32, i32* %10, align 4 + br label %29 + +29: ; preds = %27, %26 + %30 = phi i32 [ 98, %26 ], [ %28, %27 ] + store i32 %30, i32* %10, align 4 + %31 = load i32, i32* %9, align 4 + store i32 %31, i32* %7, align 4 + br label %32 + +32: ; preds = %42, %29 + %33 = load i32, i32* %7, align 4 + %34 = load i32, i32* %10, align 4 + %35 = icmp sle i32 %33, %34 + br i1 %35, label %36, label %45 + +36: ; preds = %32 + %37 = load i32, i32* %7, align 4 + %38 = mul nsw i32 %37, 1 + %39 = add nsw i32 1, %38 + store i32 %39, i32* %13, align 4 + %40 = load i32, i32* %13, align 4 + store i32 %40, i32* %21, align 4 + br label %41 + +41: ; preds = %36 + br label %42 + +42: ; preds = %41 + %43 = load i32, i32* %7, align 4 + %44 = add nsw i32 %43, 1 + store i32 %44, i32* %7, align 4 + br label %32 + +45: ; preds = %32 + br label %46 + +46: ; preds = %45 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %47 = load i32, i32* %17, align 4 + %48 = icmp sgt i32 %47, 99 + br i1 %48, label %49, label %50 + +49: ; preds = %46 + br label %52 + +50: ; preds = %46 + %51 = load i32, i32* %17, align 4 + br label %52 + +52: ; preds = %50, %49 + %53 = phi i32 [ 99, %49 ], 
[ %51, %50 ] + store i32 %53, i32* %17, align 4 + %54 = load i32, i32* %16, align 4 + store i32 %54, i32* %14, align 4 + br label %55 + +55: ; preds = %65, %52 + %56 = load i32, i32* %14, align 4 + %57 = load i32, i32* %17, align 4 + %58 = icmp sle i32 %56, %57 + br i1 %58, label %59, label %68 + +59: ; preds = %55 + %60 = load i32, i32* %14, align 4 + %61 = mul nsw i32 %60, 1 + %62 = add nsw i32 0, %61 + store i32 %62, i32* %20, align 4 + %63 = load i32, i32* %20, align 4 + store i32 %63, i32* %21, align 4 + br label %64 + +64: ; preds = %59 + br label %65 + +65: ; preds = %64 + %66 = load i32, i32* %14, align 4 + %67 = add nsw i32 %66, 1 + store i32 %67, i32* %14, align 4 + br label %55 + +68: ; preds = %55 + br label %69 + +69: ; preds = %68 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_some() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* %1) + ret void +} + +;;;;; Test-3 +;void merge_some(){ +; int a = 0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=1; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; #pragma omp for +; for (int k=0; k < 100; k++) +; a=k; +; } +;} +;;; The last two OMP for loops should be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..2(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %28 = load i32*, i32** %6, align 8 + store i32 0, i32* %9, align 4 + store i32 98, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %29 = load i32*, i32** %4, align 8 + %30 = load i32, i32* %29, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %31 = load i32, i32* %10, align 4 + %32 = icmp sgt i32 %31, 98 + br i1 %32, label %33, label %34 + +33: ; preds = %3 + br label %36 + +34: ; preds = %3 + %35 = load i32, 
i32* %10, align 4 + br label %36 + +36: ; preds = %34, %33 + %37 = phi i32 [ 98, %33 ], [ %35, %34 ] + store i32 %37, i32* %10, align 4 + %38 = load i32, i32* %9, align 4 + store i32 %38, i32* %7, align 4 + br label %39 + +39: ; preds = %49, %36 + %40 = load i32, i32* %7, align 4 + %41 = load i32, i32* %10, align 4 + %42 = icmp sle i32 %40, %41 + br i1 %42, label %43, label %52 + +43: ; preds = %39 + %44 = load i32, i32* %7, align 4 + %45 = mul nsw i32 %44, 1 + %46 = add nsw i32 1, %45 + store i32 %46, i32* %13, align 4 + %47 = load i32, i32* %13, align 4 + store i32 %47, i32* %28, align 4 + br label %48 + +48: ; preds = %43 + br label %49 + +49: ; preds = %48 + %50 = load i32, i32* %7, align 4 + %51 = add nsw i32 %50, 1 + store i32 %51, i32* %7, align 4 + br label %39 + +52: ; preds = %39 + br label %53 + +53: ; preds = %52 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %54 = load i32, i32* %17, align 4 + %55 = icmp sgt i32 %54, 99 + br i1 %55, label %56, label %57 + +56: ; preds = %53 + br label %59 + +57: ; preds = %53 + %58 = load i32, i32* %17, align 4 + br label %59 + +59: ; preds = %57, %56 + %60 = phi i32 [ 99, %56 ], [ %58, %57 ] + store i32 %60, i32* %17, align 4 + %61 = load i32, i32* %16, align 4 + store i32 %61, i32* %14, align 4 + br label %62 + +62: ; preds = %72, %59 + %63 = load i32, i32* %14, align 4 + %64 = load i32, i32* %17, align 4 + %65 = icmp sle i32 %63, %64 + br i1 %65, label %66, label %75 + +66: ; preds = %62 + %67 = load i32, i32* %14, align 4 + %68 = mul nsw i32 %67, 1 + %69 = add nsw i32 0, %68 + store i32 %69, i32* %20, align 4 + %70 = load i32, i32* %20, align 4 + store i32 %70, i32* %28, align 4 
+ br label %71 + +71: ; preds = %66 + br label %72 + +72: ; preds = %71 + %73 = load i32, i32* %14, align 4 + %74 = add nsw i32 %73, 1 + store i32 %74, i32* %14, align 4 + br label %62 + +75: ; preds = %62 + br label %76 + +76: ; preds = %75 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + store i32 0, i32* %23, align 4 + store i32 99, i32* %24, align 4 + store i32 1, i32* %25, align 4 + store i32 0, i32* %26, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %26, i32* %23, i32* %24, i32* %25, i32 1, i32 1) + %77 = load i32, i32* %24, align 4 + %78 = icmp sgt i32 %77, 99 + br i1 %78, label %79, label %80 + +79: ; preds = %76 + br label %82 + +80: ; preds = %76 + %81 = load i32, i32* %24, align 4 + br label %82 + +82: ; preds = %80, %79 + %83 = phi i32 [ 99, %79 ], [ %81, %80 ] + store i32 %83, i32* %24, align 4 + %84 = load i32, i32* %23, align 4 + store i32 %84, i32* %21, align 4 + br label %85 + +85: ; preds = %95, %82 + %86 = load i32, i32* %21, align 4 + %87 = load i32, i32* %24, align 4 + %88 = icmp sle i32 %86, %87 + br i1 %88, label %89, label %98 + +89: ; preds = %85 + %90 = load i32, i32* %21, align 4 + %91 = mul nsw i32 %90, 1 + %92 = add nsw i32 0, %91 + store i32 %92, i32* %27, align 4 + %93 = load i32, i32* %27, align 4 + store i32 %93, i32* %28, align 4 + br label %94 + +94: ; preds = %89 + br label %95 + +95: ; preds = %94 + %96 = load i32, i32* %21, align 4 + %97 = add nsw i32 %96, 1 + store i32 %97, i32* %21, align 4 + br label %85 + +98: ; preds = %85 + br label %99 + +99: ; preds = %98 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_conditional(i32 %0) #0 { + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + store i32 0, i32* %3, 
align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +;;;;; Test-4 +;void merge_conditional(int x){ +; int a = 0; +; #pragma omp parallel +; { +; if (x < 10) +; { +; #pragma omp for +; for (int i=0; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +; else +; { +; #pragma omp for +; for (int k=0; k < 100; k++) +; a=k; +; } +; } +;} +;;; The OMP for loops in the first conditional block should be merged + + + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..3(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2, i32* nonnull align 4 dereferenceable(4) %3) #1 { + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32*, align 8 + %8 = alloca i32*, align 8 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + %28 = alloca i32, align 4 + %29 = alloca i32, align 4 + store i32* %0, i32** %5, align 8 + store i32* %1, i32** %6, align 8 + store i32* %2, i32** %7, align 8 + store i32* %3, i32** %8, align 8 + %30 = load i32*, i32** %7, align 8 + %31 = load i32*, i32** %8, align 8 + %32 = load i32, i32* %30, align 4 + %33 = icmp slt i32 %32, 10 + br i1 %33, label %34, label %93 + +34: ; preds = %4 + store i32 0, i32* %11, align 4 + store i32 99, i32* %12, align 4 + 
store i32 1, i32* %13, align 4 + store i32 0, i32* %14, align 4 + %35 = load i32*, i32** %5, align 8 + %36 = load i32, i32* %35, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %36, i32 34, i32* %14, i32* %11, i32* %12, i32* %13, i32 1, i32 1) + %37 = load i32, i32* %12, align 4 + %38 = icmp sgt i32 %37, 99 + br i1 %38, label %39, label %40 + +39: ; preds = %34 + br label %42 + +40: ; preds = %34 + %41 = load i32, i32* %12, align 4 + br label %42 + +42: ; preds = %40, %39 + %43 = phi i32 [ 99, %39 ], [ %41, %40 ] + store i32 %43, i32* %12, align 4 + %44 = load i32, i32* %11, align 4 + store i32 %44, i32* %9, align 4 + br label %45 + +45: ; preds = %55, %42 + %46 = load i32, i32* %9, align 4 + %47 = load i32, i32* %12, align 4 + %48 = icmp sle i32 %46, %47 + br i1 %48, label %49, label %58 + +49: ; preds = %45 + %50 = load i32, i32* %9, align 4 + %51 = mul nsw i32 %50, 1 + %52 = add nsw i32 0, %51 + store i32 %52, i32* %15, align 4 + %53 = load i32, i32* %15, align 4 + store i32 %53, i32* %31, align 4 + br label %54 + +54: ; preds = %49 + br label %55 + +55: ; preds = %54 + %56 = load i32, i32* %9, align 4 + %57 = add nsw i32 %56, 1 + store i32 %57, i32* %9, align 4 + br label %45 + +58: ; preds = %45 + br label %59 + +59: ; preds = %58 + %60 = load i32*, i32** %5, align 8 + %61 = load i32, i32* %60, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %61) + %62 = load i32*, i32** %5, align 8 + %63 = load i32, i32* %62, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %63) + store i32 0, i32* %18, align 4 + store i32 99, i32* %19, align 4 + store i32 1, i32* %20, align 4 + store i32 0, i32* %21, align 4 + %64 = load i32*, i32** %5, align 8 + %65 = load i32, i32* %64, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %65, i32 34, i32* %21, i32* %18, i32* %19, i32* %20, i32 1, i32 1) + %66 = load i32, i32* %19, align 4 + %67 = icmp sgt i32 %66, 99 + br i1 %67, label %68, label %69 + +68: ; preds 
= %59 + br label %71 + +69: ; preds = %59 + %70 = load i32, i32* %19, align 4 + br label %71 + +71: ; preds = %69, %68 + %72 = phi i32 [ 99, %68 ], [ %70, %69 ] + store i32 %72, i32* %19, align 4 + %73 = load i32, i32* %18, align 4 + store i32 %73, i32* %16, align 4 + br label %74 + +74: ; preds = %84, %71 + %75 = load i32, i32* %16, align 4 + %76 = load i32, i32* %19, align 4 + %77 = icmp sle i32 %75, %76 + br i1 %77, label %78, label %87 + +78: ; preds = %74 + %79 = load i32, i32* %16, align 4 + %80 = mul nsw i32 %79, 1 + %81 = add nsw i32 0, %80 + store i32 %81, i32* %22, align 4 + %82 = load i32, i32* %22, align 4 + store i32 %82, i32* %31, align 4 + br label %83 + +83: ; preds = %78 + br label %84 + +84: ; preds = %83 + %85 = load i32, i32* %16, align 4 + %86 = add nsw i32 %85, 1 + store i32 %86, i32* %16, align 4 + br label %74 + +87: ; preds = %74 + br label %88 + +88: ; preds = %87 + %89 = load i32*, i32** %5, align 8 + %90 = load i32, i32* %89, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %90) + %91 = load i32*, i32** %5, align 8 + %92 = load i32, i32* %91, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %92) + br label %123 + +93: ; preds = %4 + store i32 0, i32* %25, align 4 + store i32 99, i32* %26, align 4 + store i32 1, i32* %27, align 4 + store i32 0, i32* %28, align 4 + %94 = load i32*, i32** %5, align 8 + %95 = load i32, i32* %94, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %95, i32 34, i32* %28, i32* %25, i32* %26, i32* %27, i32 1, i32 1) + %96 = load i32, i32* %26, align 4 + %97 = icmp sgt i32 %96, 99 + br i1 %97, label %98, label %99 + +98: ; preds = %93 + br label %101 + +99: ; preds = %93 + %100 = load i32, i32* %26, align 4 + br label %101 + +101: ; preds = %99, %98 + %102 = phi i32 [ 99, %98 ], [ %100, %99 ] + store i32 %102, i32* %26, align 4 + %103 = load i32, i32* %25, align 4 + store i32 %103, i32* %23, align 4 + br label %104 + +104: ; preds = %114, %101 + %105 = load i32, 
i32* %23, align 4 + %106 = load i32, i32* %26, align 4 + %107 = icmp sle i32 %105, %106 + br i1 %107, label %108, label %117 + +108: ; preds = %104 + %109 = load i32, i32* %23, align 4 + %110 = mul nsw i32 %109, 1 + %111 = add nsw i32 0, %110 + store i32 %111, i32* %29, align 4 + %112 = load i32, i32* %29, align 4 + store i32 %112, i32* %31, align 4 + br label %113 + +113: ; preds = %108 + br label %114 + +114: ; preds = %113 + %115 = load i32, i32* %23, align 4 + %116 = add nsw i32 %115, 1 + store i32 %116, i32* %23, align 4 + br label %104 + +117: ; preds = %104 + br label %118 + +118: ; preds = %117 + %119 = load i32*, i32** %5, align 8 + %120 = load i32, i32* %119, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %120) + %121 = load i32*, i32** %5, align 8 + %122 = load i32, i32* %121, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %122) + br label %123 + +123: ; preds = %118, %88 + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @main() #0 { + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + %3 = call i32 (i8*, ...) @scanf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), i32* %2) + call void @merge_all() + call void @merge_none() + call void @merge_some() + %4 = load i32, i32* %2, align 4 + call void @merge_conditional(i32 %4) + ret i32 0 +} + +declare i32 @scanf(i8*, ...) 
#4 + +attributes #0 = { noinline nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { convergent nounwind } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 15]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"PIC Level", i32 2} +!3 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 979bcbd3a6f7ea784f2098ad4cf613fbd6b09e38)"} +!4 = !{!5} +!5 = !{i64 2, i64 -1, i64 -1, i1 true} + + +;CHECK-LABEL: define void @merge_all() local_unnamed_addr #0{ +;CHECK-NEXT: 
[[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined.( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_none() local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined..1( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_some() local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined..2( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_conditional(i32 [[TMP1:%.*]]) local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK-NEXT: [[TMP3:%.*]] = alloca i32, align 4 +;CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP2]], align 4, !tbaa !4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nonnull %2, i32* nonnull [[TMP3]]) +;CHECK: br i1 [[TMP4:%.*]], label [[TMP5:%.*]], label [[TMP6:%.*]] +;CHECK-NEXT: [[TMP5]]: +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: [[TMP6]]: +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void diff --git a/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp b/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c -std=c99 -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics + +void test_1(){ + +#pragma omp parallel +{ + #pragma omp for + for (int i=0; i < 100; i++) + ; + #pragma omp for + for (int j=0; j < 10; j++) + ; + 
#pragma omp for + for (int i=0; i < 10; i++) + ; + #pragma omp for + for (int i=0; i < 10; i++) + ; +} + // The first parallel for loop will not be merged + // The last three parallel for loops will be merged +} + + +// CHECK: define void @test_1() +// CHECK: ...) @__kmpc_fork_call( +// CHECK: ret void +// CHECK: define internal void @.omp_outlined.( +// CHECK: call void @__kmpc_for_static_init_4( +// CHECK: call void @__kmpc_for_static_fini( +// CHECK-NEXT: call void @__kmpc_barrier( +// CHECK: call void @__kmpc_for_static_init_4( +// CHECK: call void @__kmpc_barrier( +// CHECK: call void @__kmpc_barrier( +// CHECK: call void @__kmpc_for_static_fini( +// CHECK-NEXT: call void @__kmpc_barrier( +// CHECK: ret void diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -94,6 +94,7 @@ static kmp_int8 warn = 0; + if (ompt_enabled.ompt_callback_work) { // Only fully initialize variables needed by OMPT if OMPT is enabled. team_info = __ompt_get_teaminfo(0, NULL);