diff --git a/.arcconfig b/.arcconfig --- a/.arcconfig +++ b/.arcconfig @@ -1,5 +1,5 @@ { - "phabricator.uri" : "https://reviews.llvm.org/", + "repository.callsign" : "G", "conduit_uri" : "https://reviews.llvm.org/" } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -194,6 +194,8 @@ __OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */ Int32) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) +__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr, Int32, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32 ) +__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32) __OMP_RTL(omp_get_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/OpenMPOpt.h" - +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" @@ -21,10 +21,12 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include using namespace llvm; using namespace omp; @@ -60,7 +62,6 @@ initializeRuntimeFunctions(); OMPBuilder.initialize(); } - /// Generic information that describes a runtime function struct RuntimeFunctionInfo { /// The kind, as described by the RuntimeFunction enum. @@ -107,19 +108,134 @@ /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. bool run() { bool Changed = false; - LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() << " functions in a slice with " << ModuleSlice.size() << " functions\n"); Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + Changed |= deleteStaticScheduleCalls(); return Changed; } private: - /// Try to delete parallel regions if possible + /// Combine "OpenMP for loop with static scheduling" + /// TODO: Conditional merging of static scheduler + bool deleteStaticScheduleCalls() { + /// Data set for storing relative information + SmallVector callInst; + SmallVector basicblock; + MapVector store_op0_op1, args_map; + MapVector> mapper; + MapVector MapInitFinCI; + + /// It takes a call instructions and extract arguments that needs to be compared for merging + auto BuildArguments = [&] (SmallVector& A, CallInst& I){ + A.clear(); + for (Value* v: I.args()) + A.push_back(v); + }; + /// It takes two call instructions and compares for the merging + auto CompareTwoInstArgs = [&] (SmallVector& A1, SmallVector& A2){ + for (auto i = A1.begin(), j = A2.begin(); i != A1.end() && j!= A2.end(); i++, j++){ + if (isa(*i) && isa(*j)){ + if ( *i != *j) return false;} + else if (store_op0_op1.find(*i)->second != store_op0_op1.find(*j)->second) + return false;} + for (auto i = A1.begin(), j = A2.begin(); i != A1.end() && j!= A2.end(); i++, j++) + args_map.insert({*j, *i}); + return true; + }; + /// Prepare information which two call instructions are compatible + /// comparing static_fini_4 call instruction for merging + auto CheckTheCompatibility = [&](){ + SmallVector Args1, Args2; + SmallVector CompInst; + for (auto i= callInst.begin(); i != callInst.end(); ++i){ + CompInst.clear(); + BuildArguments(Args1, **i); + for ( auto j = i+1; j != callInst.end(); ++j){ + BuildArguments (Args2, **j); + if (CompareTwoInstArgs(Args1, Args2)) { + CompInst.push_back(*j); + callInst.erase(j);j--; + } else { + if (CompInst.size()) mapper.insert({*i, CompInst}); + break;} + } + if (i==callInst.end()-1 && CompInst.size()) mapper.insert({*i, CompInst}); + } + return mapper.size(); + }; + /// Clean the redundent call instructions after merging + auto CleanInstructions=[&](){ + for ( auto itr :mapper){ + int count = (itr.second).size(); + Instruction *I = MapInitFinCI.find(itr.first)->second; + I->eraseFromParent(); + for (auto itr1 : itr.second){ + Instruction *I1 = itr1; + Instruction *I2 = MapInitFinCI.find(itr1)->second; + I1->eraseFromParent(); + if ( count==1) break; + I2->eraseFromParent(); + count--;} + } + }; + /// Replace the redundent register values with the relevent or alive register values after merging and cleaning + auto Replace_UseValues = [&](){ + SmallVector removeInst, removeIInst; + for ( auto b: basicblock) + for (BasicBlock::iterator II = b->begin(); II != b->end(); ++II) { + if (IntrinsicInst *I = dyn_cast (II)){ + if (I->getIntrinsicID() == Intrinsic::lifetime_start || I->getIntrinsicID() == Intrinsic::lifetime_end ) + removeIInst.push_back(I);continue;} + Instruction *It = dyn_cast(II); + if (isa(It)) continue; + for (unsigned int k = 0; k < It->getNumOperands(); k++){ + auto temp = args_map.find(It->getOperand(k)); + if (temp != args_map.end()){ + It->setOperand(k, temp->second); + if (isa(It) && k > 0) removeInst.push_back(It); + } + } + } + for (auto r: removeInst) + r->eraseFromParent(); + for (auto r: removeIInst) + r->eraseFromParent(); + }; + /// Iterate over the all the functions + for ( Function *F : SCC){ + basicblock.clear(); + for (auto &B: *F){ + basicblock.push_back(&B); + CallInst* last; + for (BasicBlock::iterator DI=B.begin(); DI != B.end(); ++DI){ + if (CallInst *c = dyn_cast(DI)) + { + if (c->getCalledFunction()->getName() == "__kmpc_for_static_init_4"){ + callInst.push_back(c); + last = c;} + else if (c->getCalledFunction()->getName() == "__kmpc_for_static_fini"){ + MapInitFinCI.insert({last, c});} + } + if (StoreInst *store = dyn_cast(DI)) + store_op0_op1.insert({store->getOperand(1),store->getOperand(0)}); + } + } + } + /// Return TRUE if there IR has been modified + if (CheckTheCompatibility()){ + CleanInstructions(); + Replace_UseValues(); + return true; + } + + return false; + } + /// Try to delete parallel regions if possible bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; diff --git a/llvm/test/Transforms/OpenMP/parallel_omp_for_loop_merge1.ll b/llvm/test/Transforms/OpenMP/parallel_omp_for_loop_merge1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_omp_for_loop_merge1.ll @@ -0,0 +1,324 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; RUN: opt -S -openmpopt < %s | FileCheck %s +; The IR was produced using -O0 and later applying mem2reg, simplifycfg, and instrcombine passes +;int main(){ +; The last three loops should merged +;#pragma omp parallel +;{ +; #pragma omp for +; for (int i=0; i < 100; i++) +; ; +; #pragma omp for +; for (int j=0; j < 10; j++) +; ; +; #pragma omp for +; for (int i=0; i < 10; i++) +; ; +; #pragma omp for +; for (int i=0; i < 10; i++) +; ; +;} +; return 0; +;} +; + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.15.0" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @main() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) + ret i32 0 +} + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined.(i32* noalias %0, i32* noalias %1) #1 { + %3 = alloca i32*, align 8 + %4 = alloca i32*, align 8 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + %28 = alloca i32, align 4 + %29 = alloca i32, align 4 + %30 = alloca i32, align 4 + %31 = alloca i32, align 4 + %32 = alloca i32, align 4 + store i32* %0, i32** %3, align 8 + store i32* %1, i32** %4, align 8 + store i32 0, i32* %7, align 4 + store i32 99, i32* %8, align 4 + store i32 1, i32* %9, align 4 + store i32 0, i32* %10, align 4 + %33 = load i32*, i32** %3, align 8 + %34 = load i32, i32* %33, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %10, i32* %7, i32* %8, i32* %9, i32 1, i32 1) + %35 = load i32, i32* %8, align 4 + %36 = icmp sgt i32 %35, 99 + br i1 %36, label %37, label %38 + +37: ; preds = %2 + br label %40 + +38: ; preds = %2 + %39 = load i32, i32* %8, align 4 + br label %40 + +40: ; preds = %38, %37 + %41 = phi i32 [ 99, %37 ], [ %39, %38 ] + store i32 %41, i32* %8, align 4 + %42 = load i32, i32* %7, align 4 + store i32 %42, i32* %5, align 4 + br label %43 + +43: ; preds = %52, %40 + %44 = load i32, i32* %5, align 4 + %45 = load i32, i32* %8, align 4 + %46 = icmp sle i32 %44, %45 + br i1 %46, label %47, label %55 + +47: ; preds = %43 + %48 = load i32, i32* %5, align 4 + %49 = mul nsw i32 %48, 1 + %50 = add nsw i32 0, %49 + store i32 %50, i32* %11, align 4 + br label %51 + +51: ; preds = %47 + br label %52 + +52: ; preds = %51 + %53 = load i32, i32* %5, align 4 + %54 = add nsw i32 %53, 1 + store i32 %54, i32* %5, align 4 + br label %43 + +55: ; preds = %43 + br label %56 + +56: ; preds = %55 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + store i32 0, i32* %14, align 4 + store i32 9, i32* %15, align 4 + store i32 1, i32* %16, align 4 + store i32 0, i32* %17, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %17, i32* %14, i32* %15, i32* %16, i32 1, i32 1) + %57 = load i32, i32* %15, align 4 + %58 = icmp sgt i32 %57, 9 + br i1 %58, label %59, label %60 + +59: ; preds = %56 + br label %62 + +60: ; preds = %56 + %61 = load i32, i32* %15, align 4 + br label %62 + +62: ; preds = %60, %59 + %63 = phi i32 [ 9, %59 ], [ %61, %60 ] + store i32 %63, i32* %15, align 4 + %64 = load i32, i32* %14, align 4 + store i32 %64, i32* %12, align 4 + br label %65 + +65: ; preds = %74, %62 + %66 = load i32, i32* %12, align 4 + %67 = load i32, i32* %15, align 4 + %68 = icmp sle i32 %66, %67 + br i1 %68, label %69, label %77 + +69: ; preds = %65 + %70 = load i32, i32* %12, align 4 + %71 = mul nsw i32 %70, 1 + %72 = add nsw i32 0, %71 + store i32 %72, i32* %18, align 4 + br label %73 + +73: ; preds = %69 + br label %74 + +74: ; preds = %73 + %75 = load i32, i32* %12, align 4 + %76 = add nsw i32 %75, 1 + store i32 %76, i32* %12, align 4 + br label %65 + +77: ; preds = %65 + br label %78 + +78: ; preds = %77 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + store i32 0, i32* %21, align 4 + store i32 9, i32* %22, align 4 + store i32 1, i32* %23, align 4 + store i32 0, i32* %24, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %24, i32* %21, i32* %22, i32* %23, i32 1, i32 1) + %79 = load i32, i32* %22, align 4 + %80 = icmp sgt i32 %79, 9 + br i1 %80, label %81, label %82 + +81: ; preds = %78 + br label %84 + +82: ; preds = %78 + %83 = load i32, i32* %22, align 4 + br label %84 + +84: ; preds = %82, %81 + %85 = phi i32 [ 9, %81 ], [ %83, %82 ] + store i32 %85, i32* %22, align 4 + %86 = load i32, i32* %21, align 4 + store i32 %86, i32* %19, align 4 + br label %87 + +87: ; preds = %96, %84 + %88 = load i32, i32* %19, align 4 + %89 = load i32, i32* %22, align 4 + %90 = icmp sle i32 %88, %89 + br i1 %90, label %91, label %99 + +91: ; preds = %87 + %92 = load i32, i32* %19, align 4 + %93 = mul nsw i32 %92, 1 + %94 = add nsw i32 0, %93 + store i32 %94, i32* %25, align 4 + br label %95 + +95: ; preds = %91 + br label %96 + +96: ; preds = %95 + %97 = load i32, i32* %19, align 4 + %98 = add nsw i32 %97, 1 + store i32 %98, i32* %19, align 4 + br label %87 + +99: ; preds = %87 + br label %100 + +100: ; preds = %99 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + store i32 0, i32* %28, align 4 + store i32 9, i32* %29, align 4 + store i32 1, i32* %30, align 4 + store i32 0, i32* %31, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %31, i32* %28, i32* %29, i32* %30, i32 1, i32 1) + %101 = load i32, i32* %29, align 4 + %102 = icmp sgt i32 %101, 9 + br i1 %102, label %103, label %104 + +103: ; preds = %100 + br label %106 + +104: ; preds = %100 + %105 = load i32, i32* %29, align 4 + br label %106 + +106: ; preds = %104, %103 + %107 = phi i32 [ 9, %103 ], [ %105, %104 ] + store i32 %107, i32* %29, align 4 + %108 = load i32, i32* %28, align 4 + store i32 %108, i32* %26, align 4 + br label %109 + +109: ; preds = %118, %106 + %110 = load i32, i32* %26, align 4 + %111 = load i32, i32* %29, align 4 + %112 = icmp sle i32 %110, %111 + br i1 %112, label %113, label %121 + +113: ; preds = %109 + %114 = load i32, i32* %26, align 4 + %115 = mul nsw i32 %114, 1 + %116 = add nsw i32 0, %115 + store i32 %116, i32* %32, align 4 + br label %117 + +117: ; preds = %113 + br label %118 + +118: ; preds = %117 + %119 = load i32, i32* %26, align 4 + %120 = add nsw i32 %119, 1 + store i32 %120, i32* %26, align 4 + br label %109 + +121: ; preds = %109 + br label %122 + +122: ; preds = %121 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: nounwind +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) #2 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #3 + +; Function Attrs: nounwind +declare !callback !4 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #2 + + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 15]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"PIC Level", i32 2} +!3 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 979bcbd3a6f7ea784f2098ad4cf613fbd6b09e38)"} +!4 = !{!5} +!5 = !{i64 2, i64 -1, i64 -1, i1 true} + +; +; CHECK: call void @__kmpc_for_static_init_4( +; CHECK: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK: call void @__kmpc_for_static_init_4( +; CHECK-NOT: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK-NOT: call void @__kmpc_for_static_init_4( +; CHECK-NOT: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK-NOT: call void @__kmpc_for_static_init_4( +; CHECK: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK: ret void +; + + diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -94,6 +94,7 @@ static kmp_int8 warn = 0; + if (ompt_enabled.ompt_callback_work) { // Only fully initialize variables needed by OMPT if OMPT is enabled. team_info = __ompt_get_teaminfo(0, NULL);