diff --git a/.arcconfig b/.arcconfig
--- a/.arcconfig
+++ b/.arcconfig
@@ -1,5 +1,5 @@
 {
-  "phabricator.uri" : "https://reviews.llvm.org/",
+  "repository.callsign" : "G",
   "conduit_uri" : "https://reviews.llvm.org/"
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -194,6 +194,8 @@
 __OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */ Int32)
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr, Int32, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32)
 
 __OMP_RTL(omp_get_thread_num, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -21,10 +21,12 @@
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include <map>
 
 using namespace llvm;
 using namespace omp;
@@ -61,6 +63,25 @@
     OMPBuilder.initialize();
   }
 
+  /// Data structure holding the information needed to delete redundant
+  /// OpenMP for-loop runtime calls.
+  struct OMPLoopFusion {
+    bool check = false;
+    /// Maps each __kmpc_for_static_init_4 call to the __kmpc_for_static_fini
+    /// call of the same OpenMP for loop.
+    std::map<CallInst *, CallInst *> call_init_fini_mapping;
+    std::map<CallInst *, BasicBlock *> call_basicblock_mapping;
+    /// Maps a __kmpc_for_static_init_4 call to all of its compatible
+    /// __kmpc_for_static_init_4 calls.
+    std::map<CallInst *, std::vector<CallInst *>> call_map;
+    std::map<CallInst *, std::vector<Value *>> call_arg;
+    /// Maintains the basic blocks that form a lineage.
+    std::map<BasicBlock *, std::vector<BasicBlock *>> chain;
+    std::vector<BasicBlock *> visited, loopVisited;
+    /// store_op0_op1 maps a store's pointer operand (operand 1) to its value
+    /// operand (operand 0); args_map maps the arguments of a redundant
+    /// __kmpc_for_static_init_4 call to those of the call it is merged into,
+    /// for later cleaning.
+    std::map<Value *, Value *> store_op0_op1, args_map;
+    CallInst *current_call_init_instruction = nullptr;
+  };
+
   /// Generic information that describes a runtime function
   struct RuntimeFunctionInfo {
 
     /// The kind, as described by the RuntimeFunction enum.
@@ -107,18 +128,306 @@
   /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
   bool run() {
     bool Changed = false;
-    LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
-                      << " functions in a slice with " << ModuleSlice.size()
-                      << " functions\n");
 
     Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    Changed |= deleteStaticScheduleCalls();
 
     return Changed;
   }
 
 private:
+  /// Combine OpenMP for loops with static scheduling: check that all
+  /// runtime-call parameters are the same and that the loops are adjacent.
+  /// See https://openmp.llvm.org/Reference.pdf, section 5.8.3.24, for the
+  /// parameters. Such loops can share the same __kmpc_for_static_init_4()
+  /// and __kmpc_for_static_fini() calls.
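+  ///
+  /// A minimal sketch of the source pattern this targets (the loop bounds and
+  /// bodies below are illustrative only, mirroring the accompanying test
+  /// case):
+  ///
+  ///   #pragma omp parallel
+  ///   {
+  ///   #pragma omp for
+  ///     for (int i = 0; i < 10; i++)
+  ///       ;
+  ///   #pragma omp for
+  ///     for (int j = 0; j < 10; j++)
+  ///       ;
+  ///   }
+  ///
+  /// Clang lowers each of the two loops to its own __kmpc_for_static_init_4 /
+  /// __kmpc_for_static_fini pair; when the arguments of adjacent pairs match,
+  /// the second pair is removed and the second loop reuses the bounds computed
+  /// for the first.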
+  bool deleteStaticScheduleCalls() {
+    bool Changed = false;
+    // If there is no __kmpc_for_static_init_4 in the module, there is nothing
+    // to do.
+    RuntimeFunctionInfo &RFI = RFIs[OMPRTL___kmpc_for_static_init_4];
+    if (!RFI.Declaration)
+      return Changed;
+    // Otherwise go through each function.
+    OMPLoopFusion OLF;
+    for (Function *F : SCC)
+      Changed |= runOverTheBlock(*F, &OLF);
+    return Changed;
+  }
+
+  /// Check the compatibility of the __kmpc_for_static_init_4 calls collected
+  /// for a lineage.
+  void checkTheCompatibility(OMPLoopFusion *OLF) {
+    bool compatible = true;
+    for (auto itr : OLF->call_init_fini_mapping) {
+      if (find(itr.first, OLF->call_map))
+        continue;
+      std::vector<CallInst *> v;
+      std::vector<Value *> v1;
+      for (Value *arg : itr.first->args())
+        v1.push_back(arg);
+      for (auto itr1 : OLF->call_init_fini_mapping) {
+        if (itr.first == itr1.first)
+          continue;
+        if (find(itr1.first, OLF->call_map))
+          continue;
+        std::vector<Value *> v2;
+        for (Value *arg2 : itr1.first->args())
+          v2.push_back(arg2);
+        for (auto i = v1.begin(), j = v2.begin();
+             i != v1.end() && j != v2.end(); ++i, ++j) {
+          if (isa<Constant>(*i) && isa<Constant>(*j)) {
+            // Constant arguments must match directly.
+            if (*i != *j) {
+              compatible = false;
+              break;
+            }
+          } else if (OLF->store_op0_op1.find(*j)->second !=
+                     OLF->store_op0_op1.find(*i)->second) {
+            // Otherwise the values stored to the argument slots must match.
+            compatible = false;
+            break;
+          }
+        }
+        if (compatible) {
+          for (auto i = v1.begin(), j = v2.begin();
+               i != v1.end() && j != v2.end(); ++i, ++j)
+            OLF->args_map.insert({*j, *i});
+          v.push_back(itr1.first);
+        } else
+          // The adjacent OpenMP for loop is not compatible, so there is no
+          // need to check the others; break out of the inner loop.
+          break;
+      }
+      // If a call instruction has compatible call instructions, record them in
+      // the call_map container.
+      OLF->call_map.insert({itr.first, v});
+      // Reset the flag for the next call instruction.
+      compatible = true;
+    }
+  }
+
+  bool checkForOMPInit(BasicBlock *B) {
+    if (!B)
+      return false;
+    for (BasicBlock::iterator BBI = B->begin(); BBI != B->end(); ++BBI) {
+      if (CallInst *c = dyn_cast<CallInst>(&*BBI))
+        if (c->getCalledFunction()->getName() == "__kmpc_for_static_init_4")
+          return true;
+    }
+    return false;
+  }
+
+  bool checkForOMPFini(BasicBlock *B) {
+    if (!B)
+      return false;
+    for (BasicBlock::iterator BBI = B->begin(); BBI != B->end(); ++BBI) {
+      if (CallInst *c = dyn_cast<CallInst>(&*BBI))
+        if (c->getCalledFunction()->getName() == "__kmpc_for_static_fini")
+          return true;
+    }
+    return false;
+  }
+
+  void markNodeVisited(BasicBlock *B, std::vector<BasicBlock *> &v,
+                       OMPLoopFusion *OLF) {
+    if (!B)
+      return;
+    OLF->visited.push_back(B);
+    v.push_back(B);
+    for (auto *BB : successors(B)) {
+      if (find(OLF->visited, BB))
+        continue;
+      markNodeVisited(BB, v, OLF);
+    }
+  }
+
+  BasicBlock *checkTheLoop(BasicBlock *B, std::vector<BasicBlock *> &v,
+                           OMPLoopFusion *OLF) {
+    std::vector<BasicBlock *> v2;
+    for (auto *S : successors(B)) {
+      if (checkLoop(S, B, v2)) {
+        // Mark all the nodes reachable from S as visited.
+        markNodeVisited(S, v, OLF);
+        return nullptr;
+      }
+      return S;
+    }
+    return nullptr;
+  }
+
+  bool checkLoop(BasicBlock *S, BasicBlock *B,
+                 std::vector<BasicBlock *> &visit) {
+    bool loop = false;
+    if (!S)
+      return loop;
+    for (auto *BB : successors(S)) {
+      if (BB == B) {
+        loop = true;
+        break;
+      }
+      if (find(visit, BB))
+        continue;
+      visit.push_back(BB);
+      loop = loop || checkLoop(BB, B, visit);
+    }
+    return loop;
+  }
+
+  int countSuccessors(BasicBlock *B) { return succ_size(B); }
+
+  int countPredecessors(BasicBlock *B) { return pred_size(B); }
+
+  void makeLineage(BasicBlock *B, std::vector<BasicBlock *> &v,
+                   OMPLoopFusion *OLF) {
+    if (!B || find(OLF->visited, B))
+      return;
+    // Blocks where several control flows merge into a unique point, or where a
+    // unique entrance splits into several control flows, cannot be part of a
+    // lineage for this optimization.
+    if (countSuccessors(B) <= 1 && countPredecessors(B) > 1)
+      return;
+    if (countPredecessors(B) <= 1 && countSuccessors(B) > 1)
+      return;
+    BasicBlock *t = nullptr;
+    if (B->getSingleSuccessor()) {
+      OLF->visited.push_back(B);
+      v.push_back(B);
+      // Try to find the starting point of an OpenMP for loop in this block.
+      if (checkForOMPInit(B))
+        // If it is found, locate the end point; all blocks in between are part
+        // of the lineage. The result is the block from which the lineage is
+        // built further.
+        t = checkOMPForLoop(B->getSingleSuccessor(), v, OLF);
+      else
+        // Otherwise take the single successor and move on.
+        t = B->getSingleSuccessor();
+    } else {
+      // The block has more than one successor or predecessor; check whether
+      // these are control points or blocks in between for loops.
+      OLF->visited.push_back(B);
+      t = checkTheLoop(B, v, OLF);
+      v.push_back(B);
+    }
+    makeLineage(t, v, OLF);
+  }
+
+  BasicBlock *checkOMPForLoop(BasicBlock *BB, std::vector<BasicBlock *> &v,
+                              OMPLoopFusion *OLF) {
+    BasicBlock *t = nullptr;
+    if (!BB)
+      return t;
+    OLF->visited.push_back(BB);
+    v.push_back(BB);
+    for (auto *B : successors(BB)) {
+      if (find(OLF->visited, B))
+        continue;
+      if (checkForOMPFini(B)) {
+        t = B;
+        continue;
+      }
+      checkOMPForLoop(B, v, OLF);
+    }
+    return t;
+  }
+
+  bool find(const std::vector<BasicBlock *> &b, BasicBlock *B) {
+    for (auto *t : b)
+      if (t == B)
+        return true;
+    return false;
+  }
+
+  bool find(CallInst *I,
+            const std::map<CallInst *, std::vector<CallInst *>> &m) {
+    for (auto &itr : m) {
+      if (itr.first == I)
+        return true;
+      for (auto *itr1 : itr.second)
+        if (I == itr1)
+          return true;
+    }
+    return false;
+  }
+
+  void clean_intrinsic_calls(BasicBlock *B, OMPLoopFusion *OLF) {
+    std::vector<Instruction *> remove;
+    for (BasicBlock::iterator DI = B->begin(); DI != B->end(); ++DI) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*DI))
+        if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+            II->getIntrinsicID() == Intrinsic::lifetime_end)
+          remove.push_back(II);
+    }
+    for (auto *r : remove)
+      r->eraseFromParent();
+  }
+
+  void check_call_instructions(BasicBlock *B, OMPLoopFusion *OLF) {
+    for (BasicBlock::iterator DI = B->begin(); DI != B->end(); ++DI) {
+      if (CallInst *c = dyn_cast<CallInst>(&*DI)) {
+        if (c->getCalledFunction()->getName() == "__kmpc_for_static_init_4")
+          OLF->current_call_init_instruction = c;
+        if (c->getCalledFunction()->getName() == "__kmpc_for_static_fini")
+          OLF->call_init_fini_mapping.insert(
+              {OLF->current_call_init_instruction, c});
+      }
+      if (StoreInst *store = dyn_cast<StoreInst>(&*DI))
+        OLF->store_op0_op1.insert(
+            {store->getOperand(1), store->getOperand(0)});
+    }
+  }
+
+  bool runOverTheBlock(Function &F, OMPLoopFusion *OLF) {
+    std::vector<BasicBlock *> v;
+    bool changed = false;
+    // For each block, prepare the data structures for its instructions.
+    for (auto &BB : F) {
+      if (find(OLF->visited, &BB))
+        continue;
+      makeLineage(&BB, v, OLF);
+      OLF->chain.insert({&BB, v});
+      v.clear();
+    }
+    // Act on the formed lineages.
+    changed = doTheOptimization(OLF);
+    return changed;
+  }
+
+  bool doTheOptimization(OMPLoopFusion *OLF) {
+    bool changed = false;
+    // Process each lineage S; B is a basic block in the lineage.
+    for (auto S : OLF->chain) {
+      for (auto *B : S.second)
+        check_call_instructions(B, OLF);
+      checkTheCompatibility(OLF);
+      changed = cleanInstructions(OLF);
+      if (changed)
+        for (auto *B : S.second) {
+          replace_UseValues(B, OLF);
+          clean_intrinsic_calls(B, OLF);
+        }
+      OLF->call_init_fini_mapping.clear();
+      OLF->call_map.clear();
+      OLF->store_op0_op1.clear();
+      OLF->args_map.clear();
+    }
+    return changed;
+  }
+
+  void replace_UseValues(BasicBlock *B, OMPLoopFusion *OLF) {
+    std::vector<Instruction *> remove;
+    for (BasicBlock::iterator II = B->begin(); II != B->end(); ++II) {
+      Instruction *It = &*II;
+      if (isa<CallInst>(It))
+        continue;
+      for (unsigned int k = 0; k < It->getNumOperands(); k++) {
+        auto temp = OLF->args_map.find(It->getOperand(k));
+        if (temp != OLF->args_map.end()) {
+          It->setOperand(k, temp->second);
+          // A store whose pointer operand was remapped duplicates the values
+          // already computed for the first loop and can be dropped.
+          if (isa<StoreInst>(It) && k > 0)
+            remove.push_back(It);
+        }
+      }
+    }
+    for (auto *r : remove)
+      r->eraseFromParent();
+  }
+
+  bool cleanInstructions(OMPLoopFusion *OLF) {
+    bool changed = false;
+    for (auto itr : OLF->call_map) {
+      int count = itr.second.size();
+      if (!count)
+        continue;
+      // Erase the fini call of the first loop; the merged region keeps the
+      // first init call and the fini call of the last compatible loop.
+      Instruction *I = OLF->call_init_fini_mapping.find(itr.first)->second;
+      I->eraseFromParent();
+      changed = true;
+      for (auto *itr1 : itr.second) {
+        Instruction *I1 = itr1;
+        Instruction *I2 = OLF->call_init_fini_mapping.find(itr1)->second;
+        I1->eraseFromParent();
+        if (count == 1)
+          break;
+        I2->eraseFromParent();
+        count--;
+      }
+    }
+    return changed;
+  }
+
+
   /// Try to delete parallel regions if possible
   bool deleteParallelRegions() {
     const unsigned CallbackCalleeOperand = 2;
diff --git a/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp b/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/parallel_for_loop_merging.cpp
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c -std=c99 -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+void test_1() {
+
+#pragma omp parallel
+{
+#pragma omp for
+  for (int i = 0; i < 100; i++)
+    ;
+#pragma omp for
+  for (int j = 0; j < 10; j++)
+    ;
+#pragma omp for
+  for (int i = 0; i < 10; i++)
+    ;
+#pragma omp for
+  for (int i = 0; i < 10; i++)
+    ;
+}
+  // The first loop (trip count 100) is not merged; the last three loops
+  // (trip count 10) are merged and share a single init/fini pair.
+}
+
+// CHECK: define void @test_1()
+// CHECK: ...) @__kmpc_fork_call(
+// CHECK: ret void
+// CHECK: define internal void @.omp_outlined.(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-NEXT: call void @__kmpc_barrier(
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_barrier(
+// CHECK: call void @__kmpc_barrier(
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-NEXT: call void @__kmpc_barrier(
+// CHECK: ret void
diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp
--- a/openmp/runtime/src/kmp_sched.cpp
+++ b/openmp/runtime/src/kmp_sched.cpp
@@ -94,6 +94,7 @@
   static kmp_int8 warn = 0;
+
   if (ompt_enabled.ompt_callback_work) {
     // Only fully initialize variables needed by OMPT if OMPT is enabled.
     team_info = __ompt_get_teaminfo(0, NULL);