diff --git a/.arcconfig b/.arcconfig --- a/.arcconfig +++ b/.arcconfig @@ -1,5 +1,5 @@ { - "phabricator.uri" : "https://reviews.llvm.org/", + "repository.callsign" : "G", "conduit_uri" : "https://reviews.llvm.org/" } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -194,6 +194,8 @@ __OMP_RTL(__kmpc_push_proc_bind, false, Void, IdentPtr, Int32, /* Int */ Int32) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) +__OMP_RTL(__kmpc_for_static_init_4, false, Void, IdentPtr, Int32, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32 ) +__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32) __OMP_RTL(omp_get_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/OpenMPOpt.h" - +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" @@ -21,10 +21,12 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include using namespace llvm; using namespace omp; @@ -60,7 +62,6 @@ initializeRuntimeFunctions(); OMPBuilder.initialize(); } - /// Generic information that describes a runtime function struct RuntimeFunctionInfo { /// The kind, as described by 
the RuntimeFunction enum.
@@ -107,19 +108,178 @@
   /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
   bool run() {
     bool Changed = false;
-
     LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
                       << " functions in a slice with " << ModuleSlice.size()
                       << " functions\n");
 
     Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    Changed |= deleteStaticScheduleCalls();
 
     return Changed;
   }
 
 private:
-  /// Try to delete parallel regions if possible
+  /// Merge adjacent static-schedule worksharing loops inside \p F.
+  ///
+  /// `__kmpc_for_static_init_4` calls are visited in block layout order and
+  /// each is paired with the next `__kmpc_for_static_fini` call. A run of
+  /// consecutive, paired init calls whose arguments are pairwise compatible
+  /// is collapsed: the first init and the last fini of the run survive, the
+  /// calls in between are erased, and operands that referred to an erased
+  /// call's arguments are redirected to the surviving init call's arguments.
+  ///
+  /// NOTE(review): layout order stands in for execution order and only the
+  /// first store through each pointer is inspected; confirm this is enough
+  /// to prove the merged loops equivalent.
+  ///
+  /// \returns true if \p F was modified.
+  bool mergeStaticScheduleCallsIn(Function &F) {
+    // All static-init calls of F in layout order.
+    SmallVector<CallInst *, 8> InitCalls;
+    // Static-init call -> the static-fini call that closes it.
+    MapVector<CallInst *, CallInst *> InitToFini;
+    // Pointer -> first value stored through it.
+    MapVector<Value *, Value *> FirstStoredValue;
+
+    // Tracked across basic blocks: the init and fini call of one loop need
+    // not share a block.
+    CallInst *OpenInit = nullptr;
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        if (auto *SI = dyn_cast<StoreInst>(&I)) {
+          FirstStoredValue.insert(
+              {SI->getPointerOperand(), SI->getValueOperand()});
+          continue;
+        }
+        auto *CI = dyn_cast<CallInst>(&I);
+        if (!CI)
+          continue;
+        // Indirect calls have no callee to inspect.
+        Function *Callee = CI->getCalledFunction();
+        if (!Callee)
+          continue;
+        StringRef Name = Callee->getName();
+        if (Name == "__kmpc_for_static_init_4") {
+          InitCalls.push_back(CI);
+          OpenInit = CI;
+        } else if (Name == "__kmpc_for_static_fini" && OpenInit) {
+          InitToFini.insert({OpenInit, CI});
+          OpenInit = nullptr;
+        }
+      }
+    }
+
+    // Argument of an erased init call -> matching argument of the init call
+    // that survives the merge.
+    MapVector<Value *, Value *> Replacements;
+
+    // Two init calls are compatible if each argument pair is the same value
+    // or a pair of values whose first stored values are identical.
+    // Replacements are recorded only when the whole call matches.
+    auto AreCompatible = [&](CallInst &Kept, CallInst &Erased) {
+      if (Kept.arg_size() != Erased.arg_size())
+        return false;
+      SmallVector<std::pair<Value *, Value *>, 8> Pending;
+      for (unsigned ArgNo = 0, E = Kept.arg_size(); ArgNo != E; ++ArgNo) {
+        Value *KeptArg = Kept.getArgOperand(ArgNo);
+        Value *ErasedArg = Erased.getArgOperand(ArgNo);
+        if (KeptArg == ErasedArg)
+          continue;
+        // Distinct constants can never be reconciled.
+        if (isa<Constant>(KeptArg) && isa<Constant>(ErasedArg))
+          return false;
+        // A missing store yields nullptr and is treated as a mismatch.
+        Value *KeptInit = FirstStoredValue.lookup(KeptArg);
+        if (!KeptInit || KeptInit != FirstStoredValue.lookup(ErasedArg))
+          return false;
+        Pending.push_back({ErasedArg, KeptArg});
+      }
+      for (const auto &Mapping : Pending)
+        Replacements.insert(Mapping);
+      return true;
+    };
+
+    // Each run holds a surviving init call followed by the init calls that
+    // are folded into it.
+    SmallVector<SmallVector<CallInst *, 4>, 4> Runs;
+    for (unsigned Idx = 0, E = InitCalls.size(); Idx != E;) {
+      CallInst *Leader = InitCalls[Idx++];
+      // An init call without a fini call cannot take part in a merge.
+      if (!InitToFini.count(Leader))
+        continue;
+      SmallVector<CallInst *, 4> Run;
+      Run.push_back(Leader);
+      while (Idx != E && InitToFini.count(InitCalls[Idx]) &&
+             AreCompatible(*Leader, *InitCalls[Idx]))
+        Run.push_back(InitCalls[Idx++]);
+      if (Run.size() > 1)
+        Runs.push_back(std::move(Run));
+    }
+    if (Runs.empty())
+      return false;
+
+    // Keep the first init and the last fini of every run, erase the rest.
+    for (auto &Run : Runs) {
+      for (unsigned Idx = 0, E = Run.size(); Idx != E; ++Idx) {
+        CallInst *Init = Run[Idx];
+        CallInst *Fini = InitToFini.lookup(Init);
+        if (Idx != 0)
+          Init->eraseFromParent();
+        if (Idx + 1 != E)
+          Fini->eraseFromParent();
+      }
+    }
+
+    // Redirect remaining uses of the erased calls' arguments. A store whose
+    // address operand was redirected only initialized an erased call's
+    // argument and is dropped.
+    SmallVector<Instruction *, 16> DeadInsts;
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+          // NOTE(review): every lifetime marker in F is removed, presumably
+          // because the merged slots outlive the original ranges - confirm.
+          Intrinsic::ID IID = II->getIntrinsicID();
+          if (IID == Intrinsic::lifetime_start ||
+              IID == Intrinsic::lifetime_end)
+            DeadInsts.push_back(II);
+          continue;
+        }
+        if (isa<CallInst>(I))
+          continue;
+        bool RedirectedStore = false;
+        for (unsigned OpNo = 0, E = I.getNumOperands(); OpNo != E; ++OpNo) {
+          auto It = Replacements.find(I.getOperand(OpNo));
+          if (It == Replacements.end())
+            continue;
+          I.setOperand(OpNo, It->second);
+          RedirectedStore |= isa<StoreInst>(I) && OpNo > 0;
+        }
+        if (RedirectedStore)
+          DeadInsts.push_back(&I);
+      }
+    }
+    for (Instruction *Dead : DeadInsts)
+      Dead->eraseFromParent();
+    return true;
+  }
+
+  /// Combine "OpenMP for loop with static scheduling"
+  ///
+  /// Every function of the SCC is processed on its own so that calls from
+  /// different functions are never merged with each other.
+  /// TODO: Conditional merging of static scheduler
+  ///
+  /// \returns true if the IR has been modified.
+  bool deleteStaticScheduleCalls() {
+    bool Changed = false;
+    for (Function *F : SCC)
+      Changed |= mergeStaticScheduleCallsIn(*F);
+    return Changed;
+  }
+
+  /// Try to delete parallel regions if possible
   bool deleteParallelRegions() {
     const unsigned CallbackCalleeOperand = 2;
 
diff --git a/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll b/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/parallelMergeForLoop.ll
@@ -0,0 +1,909 @@
+; The IR is produced using << -fopenmp -emit-llvm -S -c parallelMergeForLoop.c -o parallelMergeForLoop.ll >> flags
+; RUN: opt -S -attributor -openmpopt -O3 < %s | FileCheck %s
+; ModuleID = 'parallelMergeForLoop.c'
+source_filename = "parallelMergeForLoop.c"
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.15.0"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@.str = private unnamed_addr constant [3 x i8] c"%d\00", align 1
+
+; Function Attrs: noinline nounwind optnone ssp uwtable
+define void @merge_all() #0 {
+ %1 = alloca i32, align 4
+ store i32 0, i32* %1, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...)
@__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %1) + ret void +} +;;;;; Test-1 +;void merge_all(){ +; int a=0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=0; i < 100; i++) +; a=i; +; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +;} +;;; Both the loops should be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined.(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %21 = load i32*, i32** %6, align 8 + store i32 0, i32* %9, align 4 + store i32 99, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %22 = load i32*, i32** %4, align 8 + %23 = load i32, i32* %22, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %24 = load i32, i32* %10, align 4 + %25 = icmp sgt i32 %24, 99 + br i1 %25, label %26, label %27 + +26: ; preds = %3 + br label %29 + +27: ; preds = %3 + %28 = load i32, i32* %10, align 4 + br label %29 + +29: ; preds = %27, %26 + %30 = phi i32 [ 99, %26 ], [ %28, %27 ] + store i32 %30, i32* %10, align 4 + %31 = load i32, i32* %9, align 4 + store i32 %31, i32* %7, align 4 + br label %32 + +32: ; preds = %42, %29 + %33 = load i32, i32* %7, 
align 4 + %34 = load i32, i32* %10, align 4 + %35 = icmp sle i32 %33, %34 + br i1 %35, label %36, label %45 + +36: ; preds = %32 + %37 = load i32, i32* %7, align 4 + %38 = mul nsw i32 %37, 1 + %39 = add nsw i32 0, %38 + store i32 %39, i32* %13, align 4 + %40 = load i32, i32* %13, align 4 + store i32 %40, i32* %21, align 4 + br label %41 + +41: ; preds = %36 + br label %42 + +42: ; preds = %41 + %43 = load i32, i32* %7, align 4 + %44 = add nsw i32 %43, 1 + store i32 %44, i32* %7, align 4 + br label %32 + +45: ; preds = %32 + br label %46 + +46: ; preds = %45 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %47 = load i32, i32* %17, align 4 + %48 = icmp sgt i32 %47, 99 + br i1 %48, label %49, label %50 + +49: ; preds = %46 + br label %52 + +50: ; preds = %46 + %51 = load i32, i32* %17, align 4 + br label %52 + +52: ; preds = %50, %49 + %53 = phi i32 [ 99, %49 ], [ %51, %50 ] + store i32 %53, i32* %17, align 4 + %54 = load i32, i32* %16, align 4 + store i32 %54, i32* %14, align 4 + br label %55 + +55: ; preds = %65, %52 + %56 = load i32, i32* %14, align 4 + %57 = load i32, i32* %17, align 4 + %58 = icmp sle i32 %56, %57 + br i1 %58, label %59, label %68 + +59: ; preds = %55 + %60 = load i32, i32* %14, align 4 + %61 = mul nsw i32 %60, 1 + %62 = add nsw i32 0, %61 + store i32 %62, i32* %20, align 4 + %63 = load i32, i32* %20, align 4 + store i32 %63, i32* %21, align 4 + br label %64 + +64: ; preds = %59 + br label %65 + +65: ; preds = %64 + %66 = load i32, i32* %14, align 4 + %67 = add nsw i32 %66, 1 + store i32 %67, i32* %14, align 4 + br label %55 + +68: ; preds = %55 + br label %69 + +69: ; preds = %68 + call void 
@__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: nounwind +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) #2 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #3 + +; Function Attrs: nounwind +declare !callback !4 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #2 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_none() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* %1) + ret void +} + +;;;;; Test-2 +;void merge_none(){ +; int a=0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=1; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +;} +;;; The two OMP for loops should not be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..1(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %21 = load i32*, i32** %6, 
align 8 + store i32 0, i32* %9, align 4 + store i32 98, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %22 = load i32*, i32** %4, align 8 + %23 = load i32, i32* %22, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %24 = load i32, i32* %10, align 4 + %25 = icmp sgt i32 %24, 98 + br i1 %25, label %26, label %27 + +26: ; preds = %3 + br label %29 + +27: ; preds = %3 + %28 = load i32, i32* %10, align 4 + br label %29 + +29: ; preds = %27, %26 + %30 = phi i32 [ 98, %26 ], [ %28, %27 ] + store i32 %30, i32* %10, align 4 + %31 = load i32, i32* %9, align 4 + store i32 %31, i32* %7, align 4 + br label %32 + +32: ; preds = %42, %29 + %33 = load i32, i32* %7, align 4 + %34 = load i32, i32* %10, align 4 + %35 = icmp sle i32 %33, %34 + br i1 %35, label %36, label %45 + +36: ; preds = %32 + %37 = load i32, i32* %7, align 4 + %38 = mul nsw i32 %37, 1 + %39 = add nsw i32 1, %38 + store i32 %39, i32* %13, align 4 + %40 = load i32, i32* %13, align 4 + store i32 %40, i32* %21, align 4 + br label %41 + +41: ; preds = %36 + br label %42 + +42: ; preds = %41 + %43 = load i32, i32* %7, align 4 + %44 = add nsw i32 %43, 1 + store i32 %44, i32* %7, align 4 + br label %32 + +45: ; preds = %32 + br label %46 + +46: ; preds = %45 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %23, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %47 = load i32, i32* %17, align 4 + %48 = icmp sgt i32 %47, 99 + br i1 %48, label %49, label %50 + +49: ; preds = %46 + br label %52 + +50: ; preds = %46 + %51 = load i32, i32* %17, align 4 + br label %52 + +52: ; preds = %50, %49 + %53 = phi i32 [ 99, %49 ], 
[ %51, %50 ] + store i32 %53, i32* %17, align 4 + %54 = load i32, i32* %16, align 4 + store i32 %54, i32* %14, align 4 + br label %55 + +55: ; preds = %65, %52 + %56 = load i32, i32* %14, align 4 + %57 = load i32, i32* %17, align 4 + %58 = icmp sle i32 %56, %57 + br i1 %58, label %59, label %68 + +59: ; preds = %55 + %60 = load i32, i32* %14, align 4 + %61 = mul nsw i32 %60, 1 + %62 = add nsw i32 0, %61 + store i32 %62, i32* %20, align 4 + %63 = load i32, i32* %20, align 4 + store i32 %63, i32* %21, align 4 + br label %64 + +64: ; preds = %59 + br label %65 + +65: ; preds = %64 + %66 = load i32, i32* %14, align 4 + %67 = add nsw i32 %66, 1 + store i32 %67, i32* %14, align 4 + br label %55 + +68: ; preds = %55 + br label %69 + +69: ; preds = %68 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %23) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %23) + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_some() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* %1) + ret void +} + +;;;;; Test-3 +;void merge_some(){ +; int a = 0; +; #pragma omp parallel +; { +; #pragma omp for +; for (int i=1; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; #pragma omp for +; for (int k=0; k < 100; k++) +; a=k; +; } +;} +;;; The last two OMP for loops should be merged + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..2(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2) #1 { + %4 = alloca i32*, align 8 + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + store i32* %0, i32** %4, align 8 + store i32* %1, i32** %5, align 8 + store i32* %2, i32** %6, align 8 + %28 = load i32*, i32** %6, align 8 + store i32 0, i32* %9, align 4 + store i32 98, i32* %10, align 4 + store i32 1, i32* %11, align 4 + store i32 0, i32* %12, align 4 + %29 = load i32*, i32** %4, align 8 + %30 = load i32, i32* %29, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %12, i32* %9, i32* %10, i32* %11, i32 1, i32 1) + %31 = load i32, i32* %10, align 4 + %32 = icmp sgt i32 %31, 98 + br i1 %32, label %33, label %34 + +33: ; preds = %3 + br label %36 + +34: ; preds = %3 + %35 = load i32, 
i32* %10, align 4 + br label %36 + +36: ; preds = %34, %33 + %37 = phi i32 [ 98, %33 ], [ %35, %34 ] + store i32 %37, i32* %10, align 4 + %38 = load i32, i32* %9, align 4 + store i32 %38, i32* %7, align 4 + br label %39 + +39: ; preds = %49, %36 + %40 = load i32, i32* %7, align 4 + %41 = load i32, i32* %10, align 4 + %42 = icmp sle i32 %40, %41 + br i1 %42, label %43, label %52 + +43: ; preds = %39 + %44 = load i32, i32* %7, align 4 + %45 = mul nsw i32 %44, 1 + %46 = add nsw i32 1, %45 + store i32 %46, i32* %13, align 4 + %47 = load i32, i32* %13, align 4 + store i32 %47, i32* %28, align 4 + br label %48 + +48: ; preds = %43 + br label %49 + +49: ; preds = %48 + %50 = load i32, i32* %7, align 4 + %51 = add nsw i32 %50, 1 + store i32 %51, i32* %7, align 4 + br label %39 + +52: ; preds = %39 + br label %53 + +53: ; preds = %52 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + store i32 0, i32* %16, align 4 + store i32 99, i32* %17, align 4 + store i32 1, i32* %18, align 4 + store i32 0, i32* %19, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %19, i32* %16, i32* %17, i32* %18, i32 1, i32 1) + %54 = load i32, i32* %17, align 4 + %55 = icmp sgt i32 %54, 99 + br i1 %55, label %56, label %57 + +56: ; preds = %53 + br label %59 + +57: ; preds = %53 + %58 = load i32, i32* %17, align 4 + br label %59 + +59: ; preds = %57, %56 + %60 = phi i32 [ 99, %56 ], [ %58, %57 ] + store i32 %60, i32* %17, align 4 + %61 = load i32, i32* %16, align 4 + store i32 %61, i32* %14, align 4 + br label %62 + +62: ; preds = %72, %59 + %63 = load i32, i32* %14, align 4 + %64 = load i32, i32* %17, align 4 + %65 = icmp sle i32 %63, %64 + br i1 %65, label %66, label %75 + +66: ; preds = %62 + %67 = load i32, i32* %14, align 4 + %68 = mul nsw i32 %67, 1 + %69 = add nsw i32 0, %68 + store i32 %69, i32* %20, align 4 + %70 = load i32, i32* %20, align 4 + store i32 %70, i32* %28, align 4 
+ br label %71 + +71: ; preds = %66 + br label %72 + +72: ; preds = %71 + %73 = load i32, i32* %14, align 4 + %74 = add nsw i32 %73, 1 + store i32 %74, i32* %14, align 4 + br label %62 + +75: ; preds = %62 + br label %76 + +76: ; preds = %75 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + store i32 0, i32* %23, align 4 + store i32 99, i32* %24, align 4 + store i32 1, i32* %25, align 4 + store i32 0, i32* %26, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %30, i32 34, i32* %26, i32* %23, i32* %24, i32* %25, i32 1, i32 1) + %77 = load i32, i32* %24, align 4 + %78 = icmp sgt i32 %77, 99 + br i1 %78, label %79, label %80 + +79: ; preds = %76 + br label %82 + +80: ; preds = %76 + %81 = load i32, i32* %24, align 4 + br label %82 + +82: ; preds = %80, %79 + %83 = phi i32 [ 99, %79 ], [ %81, %80 ] + store i32 %83, i32* %24, align 4 + %84 = load i32, i32* %23, align 4 + store i32 %84, i32* %21, align 4 + br label %85 + +85: ; preds = %95, %82 + %86 = load i32, i32* %21, align 4 + %87 = load i32, i32* %24, align 4 + %88 = icmp sle i32 %86, %87 + br i1 %88, label %89, label %98 + +89: ; preds = %85 + %90 = load i32, i32* %21, align 4 + %91 = mul nsw i32 %90, 1 + %92 = add nsw i32 0, %91 + store i32 %92, i32* %27, align 4 + %93 = load i32, i32* %27, align 4 + store i32 %93, i32* %28, align 4 + br label %94 + +94: ; preds = %89 + br label %95 + +95: ; preds = %94 + %96 = load i32, i32* %21, align 4 + %97 = add nsw i32 %96, 1 + store i32 %97, i32* %21, align 4 + br label %85 + +98: ; preds = %85 + br label %99 + +99: ; preds = %98 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %30) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %30) + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define void @merge_conditional(i32 %0) #0 { + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + store i32 0, i32* %3, 
align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +;;;;; Test-4 +;void merge_conditional(int x){ +; int a = 0; +; #pragma omp parallel +; { +; if (x < 10) +; { +; #pragma omp for +; for (int i=0; i < 100; i++) +; a=i; +; #pragma omp for +; for (int j=0; j < 100; j++) +; a=j; +; } +; else +; { +; #pragma omp for +; for (int k=0; k < 100; k++) +; a=k; +; } +; } +;} +;;; The OMP for loops in the first conditional block should be merged + + + + + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined..3(i32* noalias %0, i32* noalias %1, i32* nonnull align 4 dereferenceable(4) %2, i32* nonnull align 4 dereferenceable(4) %3) #1 { + %5 = alloca i32*, align 8 + %6 = alloca i32*, align 8 + %7 = alloca i32*, align 8 + %8 = alloca i32*, align 8 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + %28 = alloca i32, align 4 + %29 = alloca i32, align 4 + store i32* %0, i32** %5, align 8 + store i32* %1, i32** %6, align 8 + store i32* %2, i32** %7, align 8 + store i32* %3, i32** %8, align 8 + %30 = load i32*, i32** %7, align 8 + %31 = load i32*, i32** %8, align 8 + %32 = load i32, i32* %30, align 4 + %33 = icmp slt i32 %32, 10 + br i1 %33, label %34, label %93 + +34: ; preds = %4 + store i32 0, i32* %11, align 4 + store i32 99, i32* %12, align 4 + 
store i32 1, i32* %13, align 4 + store i32 0, i32* %14, align 4 + %35 = load i32*, i32** %5, align 8 + %36 = load i32, i32* %35, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %36, i32 34, i32* %14, i32* %11, i32* %12, i32* %13, i32 1, i32 1) + %37 = load i32, i32* %12, align 4 + %38 = icmp sgt i32 %37, 99 + br i1 %38, label %39, label %40 + +39: ; preds = %34 + br label %42 + +40: ; preds = %34 + %41 = load i32, i32* %12, align 4 + br label %42 + +42: ; preds = %40, %39 + %43 = phi i32 [ 99, %39 ], [ %41, %40 ] + store i32 %43, i32* %12, align 4 + %44 = load i32, i32* %11, align 4 + store i32 %44, i32* %9, align 4 + br label %45 + +45: ; preds = %55, %42 + %46 = load i32, i32* %9, align 4 + %47 = load i32, i32* %12, align 4 + %48 = icmp sle i32 %46, %47 + br i1 %48, label %49, label %58 + +49: ; preds = %45 + %50 = load i32, i32* %9, align 4 + %51 = mul nsw i32 %50, 1 + %52 = add nsw i32 0, %51 + store i32 %52, i32* %15, align 4 + %53 = load i32, i32* %15, align 4 + store i32 %53, i32* %31, align 4 + br label %54 + +54: ; preds = %49 + br label %55 + +55: ; preds = %54 + %56 = load i32, i32* %9, align 4 + %57 = add nsw i32 %56, 1 + store i32 %57, i32* %9, align 4 + br label %45 + +58: ; preds = %45 + br label %59 + +59: ; preds = %58 + %60 = load i32*, i32** %5, align 8 + %61 = load i32, i32* %60, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %61) + %62 = load i32*, i32** %5, align 8 + %63 = load i32, i32* %62, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %63) + store i32 0, i32* %18, align 4 + store i32 99, i32* %19, align 4 + store i32 1, i32* %20, align 4 + store i32 0, i32* %21, align 4 + %64 = load i32*, i32** %5, align 8 + %65 = load i32, i32* %64, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %65, i32 34, i32* %21, i32* %18, i32* %19, i32* %20, i32 1, i32 1) + %66 = load i32, i32* %19, align 4 + %67 = icmp sgt i32 %66, 99 + br i1 %67, label %68, label %69 + +68: ; preds 
= %59 + br label %71 + +69: ; preds = %59 + %70 = load i32, i32* %19, align 4 + br label %71 + +71: ; preds = %69, %68 + %72 = phi i32 [ 99, %68 ], [ %70, %69 ] + store i32 %72, i32* %19, align 4 + %73 = load i32, i32* %18, align 4 + store i32 %73, i32* %16, align 4 + br label %74 + +74: ; preds = %84, %71 + %75 = load i32, i32* %16, align 4 + %76 = load i32, i32* %19, align 4 + %77 = icmp sle i32 %75, %76 + br i1 %77, label %78, label %87 + +78: ; preds = %74 + %79 = load i32, i32* %16, align 4 + %80 = mul nsw i32 %79, 1 + %81 = add nsw i32 0, %80 + store i32 %81, i32* %22, align 4 + %82 = load i32, i32* %22, align 4 + store i32 %82, i32* %31, align 4 + br label %83 + +83: ; preds = %78 + br label %84 + +84: ; preds = %83 + %85 = load i32, i32* %16, align 4 + %86 = add nsw i32 %85, 1 + store i32 %86, i32* %16, align 4 + br label %74 + +87: ; preds = %74 + br label %88 + +88: ; preds = %87 + %89 = load i32*, i32** %5, align 8 + %90 = load i32, i32* %89, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %90) + %91 = load i32*, i32** %5, align 8 + %92 = load i32, i32* %91, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %92) + br label %123 + +93: ; preds = %4 + store i32 0, i32* %25, align 4 + store i32 99, i32* %26, align 4 + store i32 1, i32* %27, align 4 + store i32 0, i32* %28, align 4 + %94 = load i32*, i32** %5, align 8 + %95 = load i32, i32* %94, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %95, i32 34, i32* %28, i32* %25, i32* %26, i32* %27, i32 1, i32 1) + %96 = load i32, i32* %26, align 4 + %97 = icmp sgt i32 %96, 99 + br i1 %97, label %98, label %99 + +98: ; preds = %93 + br label %101 + +99: ; preds = %93 + %100 = load i32, i32* %26, align 4 + br label %101 + +101: ; preds = %99, %98 + %102 = phi i32 [ 99, %98 ], [ %100, %99 ] + store i32 %102, i32* %26, align 4 + %103 = load i32, i32* %25, align 4 + store i32 %103, i32* %23, align 4 + br label %104 + +104: ; preds = %114, %101 + %105 = load i32, 
i32* %23, align 4 + %106 = load i32, i32* %26, align 4 + %107 = icmp sle i32 %105, %106 + br i1 %107, label %108, label %117 + +108: ; preds = %104 + %109 = load i32, i32* %23, align 4 + %110 = mul nsw i32 %109, 1 + %111 = add nsw i32 0, %110 + store i32 %111, i32* %29, align 4 + %112 = load i32, i32* %29, align 4 + store i32 %112, i32* %31, align 4 + br label %113 + +113: ; preds = %108 + br label %114 + +114: ; preds = %113 + %115 = load i32, i32* %23, align 4 + %116 = add nsw i32 %115, 1 + store i32 %116, i32* %23, align 4 + br label %104 + +117: ; preds = %104 + br label %118 + +118: ; preds = %117 + %119 = load i32*, i32** %5, align 8 + %120 = load i32, i32* %119, align 4 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %120) + %121 = load i32*, i32** %5, align 8 + %122 = load i32, i32* %121, align 4 + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %122) + br label %123 + +123: ; preds = %118, %88 + ret void +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @main() #0 { + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + %3 = call i32 (i8*, ...) @scanf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), i32* %2) + call void @merge_all() + call void @merge_none() + call void @merge_some() + %4 = load i32, i32* %2, align 4 + call void @merge_conditional(i32 %4) + ret i32 0 +} + +declare i32 @scanf(i8*, ...) 
#4 + +attributes #0 = { noinline nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { convergent nounwind } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 15]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"PIC Level", i32 2} +!3 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 979bcbd3a6f7ea784f2098ad4cf613fbd6b09e38)"} +!4 = !{!5} +!5 = !{i64 2, i64 -1, i64 -1, i1 true} + + +;CHECK-LABEL: define void @merge_all() local_unnamed_addr #0 { +;CHECK-NEXT:
[[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined.( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_none() local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined..1( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK-LABEL: define void @merge_some() local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...)
@__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]]) +;CHECK: define internal void @.omp_outlined..2( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void +;CHECK: define void @merge_conditional(i32 [[TMP1:%.*]]) local_unnamed_addr #0 { +;CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4 +;CHECK-NEXT: [[TMP3:%.*]] = alloca i32, align 4 +;CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP2]], align 4, !tbaa !4 +;CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @3, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nonnull [[TMP2]], i32* nonnull [[TMP3]]) +;CHECK: br i1 [[TMP4:%.*]], label %[[TMP5:[0-9]+]], label %[[TMP6:[0-9]+]] +;CHECK: [[TMP5]]: +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_barrier( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: [[TMP6]]: +;CHECK: call void @__kmpc_for_static_init_4( +;CHECK: call void @__kmpc_for_static_fini( +;CHECK: call void @__kmpc_barrier( +;CHECK: ret void diff --git a/llvm/test/Transforms/OpenMP/parallel_omp_for_loop_merge1.ll b/llvm/test/Transforms/OpenMP/parallel_omp_for_loop_merge1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_omp_for_loop_merge1.ll @@ -0,0 +1,375 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; RUN: opt -S -O3 < %s | FileCheck %s +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.15.0"
+;;;;;;;;;;;;;;;;;; The last three loops should merge;;;;;;;;;;;;; +;#include +;#include +;#include +; +;int g(int); +;int f(int); +; +;int main(){ +; +;#pragma omp parallel +;{ +; #pragma omp for +; for (int i=0; i < 100; i++) +; printf("Before--> %d\n", g(i)); +; #pragma omp for +; for (int j=0; j < 10; j++) +; printf("After-->%d\n", f(j)); +; #pragma omp for +; for (int i=0; i < 10; i++) +; printf("Middle-->%d\n", g(i)); +; #pragma omp for +; for (int i=0; i < 10; i++) +; printf("Last-->%d\n", g(i)); +;} +; return 0; +;} +; +;int g(int i){ +;// i= i+1; +; return i; +;} +;int f(int j){ +;// j = j+1*j; +; +; return j; +;} +;;;;;;;;;;;;;;;;;;;;;;;; +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@.str = private unnamed_addr constant [14 x i8] c"Before--> %d\0A\00", align 1 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [12 x i8] c"After-->%d\0A\00", align 1 +@.str.2 = private unnamed_addr constant [13 x i8] c"Middle-->%d\0A\00", align 1 +@.str.3 = private unnamed_addr constant [11 x i8] c"Last-->%d\0A\00", align 1 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 + +; Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @g(i32 %0) #0 { + %2 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + %3 = load i32, i32* %2, align 4 + ret i32 %3 +} + +; Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @f(i32 %0) #0 { + %2 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + %3 = load i32, i32* %2, align 4 + ret i32 %3 +} + +; 
Function Attrs: noinline nounwind optnone ssp uwtable +define i32 @main() #0 { + %1 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @3, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) + ret i32 0 +} + +; Function Attrs: noinline norecurse nounwind optnone ssp uwtable +define internal void @.omp_outlined.(i32* noalias %0, i32* noalias %1) #1 { + %3 = alloca i32*, align 8 + %4 = alloca i32*, align 8 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %10 = alloca i32, align 4 + %11 = alloca i32, align 4 + %12 = alloca i32, align 4 + %13 = alloca i32, align 4 + %14 = alloca i32, align 4 + %15 = alloca i32, align 4 + %16 = alloca i32, align 4 + %17 = alloca i32, align 4 + %18 = alloca i32, align 4 + %19 = alloca i32, align 4 + %20 = alloca i32, align 4 + %21 = alloca i32, align 4 + %22 = alloca i32, align 4 + %23 = alloca i32, align 4 + %24 = alloca i32, align 4 + %25 = alloca i32, align 4 + %26 = alloca i32, align 4 + %27 = alloca i32, align 4 + %28 = alloca i32, align 4 + %29 = alloca i32, align 4 + %30 = alloca i32, align 4 + %31 = alloca i32, align 4 + %32 = alloca i32, align 4 + store i32* %0, i32** %3, align 8 + store i32* %1, i32** %4, align 8 + store i32 0, i32* %7, align 4 + store i32 99, i32* %8, align 4 + store i32 1, i32* %9, align 4 + store i32 0, i32* %10, align 4 + %33 = load i32*, i32** %3, align 8 + %34 = load i32, i32* %33, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %10, i32* %7, i32* %8, i32* %9, i32 1, i32 1) + %35 = load i32, i32* %8, align 4 + %36 = icmp sgt i32 %35, 99 + br i1 %36, label %37, label %38 + +37: ; preds = %2 + br label %40 + +38: ; preds = %2 + %39 = load i32, i32* %8, align 4 + br label %40 + +40: ; preds = %38, %37 + %41 = phi i32 [ 99, %37 
], [ %39, %38 ] + store i32 %41, i32* %8, align 4 + %42 = load i32, i32* %7, align 4 + store i32 %42, i32* %5, align 4 + br label %43 + +43: ; preds = %55, %40 + %44 = load i32, i32* %5, align 4 + %45 = load i32, i32* %8, align 4 + %46 = icmp sle i32 %44, %45 + br i1 %46, label %47, label %58 + +47: ; preds = %43 + %48 = load i32, i32* %5, align 4 + %49 = mul nsw i32 %48, 1 + %50 = add nsw i32 0, %49 + store i32 %50, i32* %11, align 4 + %51 = load i32, i32* %11, align 4 + %52 = call i32 @g(i32 %51) + %53 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str, i64 0, i64 0), i32 %52) + br label %54 + +54: ; preds = %47 + br label %55 + +55: ; preds = %54 + %56 = load i32, i32* %5, align 4 + %57 = add nsw i32 %56, 1 + store i32 %57, i32* %5, align 4 + br label %43 + +58: ; preds = %43 + br label %59 + +59: ; preds = %58 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + store i32 0, i32* %14, align 4 + store i32 9, i32* %15, align 4 + store i32 1, i32* %16, align 4 + store i32 0, i32* %17, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %17, i32* %14, i32* %15, i32* %16, i32 1, i32 1) + %60 = load i32, i32* %15, align 4 + %61 = icmp sgt i32 %60, 9 + br i1 %61, label %62, label %63 + +62: ; preds = %59 + br label %65 + +63: ; preds = %59 + %64 = load i32, i32* %15, align 4 + br label %65 + +65: ; preds = %63, %62 + %66 = phi i32 [ 9, %62 ], [ %64, %63 ] + store i32 %66, i32* %15, align 4 + %67 = load i32, i32* %14, align 4 + store i32 %67, i32* %12, align 4 + br label %68 + +68: ; preds = %80, %65 + %69 = load i32, i32* %12, align 4 + %70 = load i32, i32* %15, align 4 + %71 = icmp sle i32 %69, %70 + br i1 %71, label %72, label %83 + +72: ; preds = %68 + %73 = load i32, i32* %12, align 4 + %74 = mul nsw i32 %73, 1 + %75 = add nsw i32 0, %74 + store i32 %75, i32* %18, align 4 + %76 = load i32, i32* %18, align 4 + %77 = call 
i32 @f(i32 %76) + %78 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0), i32 %77) + br label %79 + +79: ; preds = %72 + br label %80 + +80: ; preds = %79 + %81 = load i32, i32* %12, align 4 + %82 = add nsw i32 %81, 1 + store i32 %82, i32* %12, align 4 + br label %68 + +83: ; preds = %68 + br label %84 + +84: ; preds = %83 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + store i32 0, i32* %21, align 4 + store i32 9, i32* %22, align 4 + store i32 1, i32* %23, align 4 + store i32 0, i32* %24, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %24, i32* %21, i32* %22, i32* %23, i32 1, i32 1) + %85 = load i32, i32* %22, align 4 + %86 = icmp sgt i32 %85, 9 + br i1 %86, label %87, label %88 + +87: ; preds = %84 + br label %90 + +88: ; preds = %84 + %89 = load i32, i32* %22, align 4 + br label %90 + +90: ; preds = %88, %87 + %91 = phi i32 [ 9, %87 ], [ %89, %88 ] + store i32 %91, i32* %22, align 4 + %92 = load i32, i32* %21, align 4 + store i32 %92, i32* %19, align 4 + br label %93 + +93: ; preds = %105, %90 + %94 = load i32, i32* %19, align 4 + %95 = load i32, i32* %22, align 4 + %96 = icmp sle i32 %94, %95 + br i1 %96, label %97, label %108 + +97: ; preds = %93 + %98 = load i32, i32* %19, align 4 + %99 = mul nsw i32 %98, 1 + %100 = add nsw i32 0, %99 + store i32 %100, i32* %25, align 4 + %101 = load i32, i32* %25, align 4 + %102 = call i32 @g(i32 %101) + %103 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.2, i64 0, i64 0), i32 %102) + br label %104 + +104: ; preds = %97 + br label %105 + +105: ; preds = %104 + %106 = load i32, i32* %19, align 4 + %107 = add nsw i32 %106, 1 + store i32 %107, i32* %19, align 4 + br label %93 + +108: ; preds = %93 + br label %109 + +109: ; preds = %108 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + store i32 0, i32* %28, align 4 + store i32 9, i32* %29, align 4 + store i32 1, i32* %30, align 4 + store i32 0, i32* %31, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %34, i32 34, i32* %31, i32* %28, i32* %29, i32* %30, i32 1, i32 1) + %110 = load i32, i32* %29, align 4 + %111 = icmp sgt i32 %110, 9 + br i1 %111, label %112, label %113 + +112: ; preds = %109 + br label %115 + +113: ; preds = %109 + %114 = load i32, i32* %29, align 4 + br label %115 + +115: ; preds = %113, %112 + %116 = phi i32 [ 9, %112 ], [ %114, %113 ] + store i32 %116, i32* %29, align 4 + %117 = load i32, i32* %28, align 4 + store i32 %117, i32* %26, align 4 + br label %118 + +118: ; preds = %130, %115 + %119 = load i32, i32* %26, align 4 + %120 = load i32, i32* %29, align 4 + %121 = icmp sle i32 %119, %120 + br i1 %121, label %122, label %133 + +122: ; preds = %118 + %123 = load i32, i32* %26, align 4 + %124 = mul nsw i32 %123, 1 + %125 = add nsw i32 0, %124 + store i32 %125, i32* %32, align 4 + %126 = load i32, i32* %32, align 4 + %127 = call i32 @g(i32 %126) + %128 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.3, i64 0, i64 0), i32 %127) + br label %129 + +129: ; preds = %122 + br label %130 + +130: ; preds = %129 + %131 = load i32, i32* %26, align 4 + %132 = add nsw i32 %131, 1 + store i32 %132, i32* %26, align 4 + br label %118 + +133: ; preds = %118 + br label %134 + +134: ; preds = %133 + call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %34) + call void @__kmpc_barrier(%struct.ident_t* @2, i32 %34) + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare i32 @printf(i8*, ...) #2 + +; Function Attrs: nounwind +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) #3 + +; Function Attrs: convergent nounwind +declare void @__kmpc_barrier(%struct.ident_t*, i32) #4 + +; Function Attrs: nounwind +declare !callback !4 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #3 + +attributes #0 = { noinline nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone ssp uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } 
+attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } +attributes #4 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 15]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"PIC Level", i32 2} +!3 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 979bcbd3a6f7ea784f2098ad4cf613fbd6b09e38)"} +!4 = !{!5} +!5 = !{i64 2, i64 -1, i64 -1, i1 true} +; +; CHECK: call void @__kmpc_for_static_init_4( +; CHECK: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK: call void @__kmpc_for_static_init_4( +; CHECK-NOT: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK-NOT: call void @__kmpc_for_static_init_4( +; CHECK-NOT: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK-NOT: call void @__kmpc_for_static_init_4( +; CHECK: call void @__kmpc_for_static_fini( +; CHECK: call void @__kmpc_barrier( +; CHECK: ret void +; diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -94,6 +94,7 @@ static kmp_int8 warn = 0; + if (ompt_enabled.ompt_callback_work) { // Only fully initialize variables needed by OMPT if OMPT is enabled. team_info = __ompt_get_teaminfo(0, NULL);