diff --git a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h @@ -0,0 +1,26 @@ +//===- IPO/OpenMPOpt.h - Collection of OpenMP optimizations -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_OPENMP_OPT_H +#define LLVM_TRANSFORMS_IPO_OPENMP_OPT_H + +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// OpenMP optimizations pass. +struct OpenMPOptPass : public PassInfoMixin { + PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, + LazyCallGraph &CG, CGSCCUpdateResult &UR); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_OPENMP_OPT_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -85,6 +85,7 @@ #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" +#include "llvm/Transforms/IPO/OpenMPOpt.h" #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Transforms/IPO/SampleProfile.h" @@ -816,6 +817,10 @@ if (Level == O3) MainCGPipeline.addPass(ArgumentPromotionPass()); + // Try to perform OpenMP specific optimizations. This is a no-op if there are + // no OpenMP runtime calls present in the module. + MainCGPipeline.addPass(OpenMPOptPass()); + // Lastly, add the core function simplification pipeline nested inside the // CGSCC walk. MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -108,6 +108,7 @@ CGSCC_PASS("invalidate", InvalidateAllAnalysesPass()) CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass()) CGSCC_PASS("inline", InlinerPass()) +CGSCC_PASS("openmpopt", OpenMPOptPass()) CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass()) #undef CGSCC_PASS diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -26,6 +26,7 @@ LoopExtractor.cpp LowerTypeTests.cpp MergeFunctions.cpp + OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -0,0 +1,307 @@ +//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// OpenMP specific optimizations +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/OpenMPOpt.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/OpenMPConstants.h" + +using namespace llvm; +using namespace omp; + +#define DEBUG_TYPE "openmp-opt" + +static cl::opt DisableOpenMPOptimizations( + "openmp-opt-disable", cl::ZeroOrMore, + cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, + cl::init(false)); + +static constexpr auto Tag = "[OpenMPOpt] "; + +namespace { +struct OpenMPOpt { + + OpenMPOpt(SmallPtrSetImpl &SCC, + SmallPtrSetImpl &ModuleSlice) + : M(*(*SCC.begin())->getParent()), SCC(SCC), ModuleSlice(ModuleSlice) {} + + /// Declarations for OpenMP runtime functions + /// + ///{ + + /// Generic information that describes a runtime function + struct RuntimeFunctionInfo { + /// The kind, as described by the RuntimeFunction enum. + RuntimeFunction Kind; + + /// The name of the function. + StringRef Name; + + /// Flag to indicate a variadic function. + bool IsVarARg; + + /// The return type of the function. + Type *ReturnType; + + /// The argument types of the function. + SmallVector ArgumentTypes; + + /// The declaration if available. + Function *Declaration; + + /// Uses of this runtime function per function containing the use. + DenseMap> UsesMap; + + /// Return the number of arguments (or the minimal number for variadic + /// functions). + size_t getNumArgs() const { return ArgumentTypes.size(); } + }; + + ///} + + /// Declarations for LLVM-IR types (simple and structure) are generated below. + /// Their names are defined and used in OpenMPKinds.def. Here we provide the + /// declarations, the initializeTypes function will provide the values. + /// + ///{ + +#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr; +#define OMP_STRUCT_TYPE(VarName, StrName, ...) Type *VarName = nullptr; +#include "llvm/IR/OpenMPKinds.def" + + ///} + + bool run() { + initializeTypes(M); + if (!initializeRuntimeFunctions(M)) { + DisableOpenMPOptimizations = true; + return false; + } + + bool Changed = false; + LLVM_DEBUG(dbgs() << Tag << "Run on SCC with " << SCC.size() + << " functions in a slice with " << ModuleSlice.size() + << " functions\n"); + Changed |= deduplicateRuntimeCalls(); + return Changed; + } + +private: + /// Try to eliminiate runtime calls by reuseing existing ones. + bool deduplicateRuntimeCalls() { + bool Changed = false; + + SmallSetVector GTIdArgs; + collectGlobalThreadIdArguments(GTIdArgs); + LLVM_DEBUG(dbgs() << Tag << "Found " << GTIdArgs.size() + << " global thread ID arguments\n"); + + for (Function *F : SCC) { + Value *GTIdArg = nullptr; + llvm::any_of(F->args(), [&](Argument &Arg) { + return GTIdArg = GTIdArgs.count(&Arg) ? &Arg : nullptr; + }); + Changed |= deduplicateRuntimeCalls( + *F, RTIs[OMPRTL___kmpc_global_thread_num], GTIdArg); + } + + return Changed; + } + + /// Try to eliminiate calls of \p RTI in \p F by reuseing an existing one or + /// \p ReplVal if given. + bool deduplicateRuntimeCalls(Function &F, RuntimeFunctionInfo &RTI, + Value *ReplVal) { + auto &Uses = RTI.UsesMap[&F]; + if (Uses.size() + (ReplVal != nullptr) < 2) + return false; + + LLVM_DEBUG(dbgs() << Tag << "Deduplicate " << Uses.size() << " uses of " + << RTI.Name + << (ReplVal ? " with an existing value\n" : "\n") + << "\n"); + + bool Changed = false; + for (Use *U : Uses) { + CallInst *CI = getCallIfRegularCall(*U, &RTI); + if (!CI) + continue; + if (!ReplVal) { + CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); + ReplVal = CI; + continue; + } + CI->replaceAllUsesWith(ReplVal); + CI->eraseFromParent(); + Changed = true; + } + + return Changed; + } + + /// Collect arguments that represent the global thread id in \p GTIdArgs. + void collectGlobalThreadIdArguments(SmallSetVector >IdArgs) { + // TODO: Below we basically perform a fixpoint iteration with a pessimistic + // initialization. We could define an AbstractAttribute instead and + // run the Attributor here once it can be run as an SCC pass. + + // Helper to check the argument \p ArgNo at all call sites of \p F for + // a GTId. + auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) { + if (!F.hasLocalLinkage()) + return false; + for (Use &U : F.uses()) { + if (CallInst *CI = getCallIfRegularCall(U)) { + Value *ArgOp = CI->getArgOperand(ArgNo); + if (CI == &RefCI || GTIdArgs.count(ArgOp) || + getCallIfRegularCall(*ArgOp, + &RTIs[OMPRTL___kmpc_global_thread_num])) + continue; + } + return false; + } + return true; + }; + + // Helper to identify uses of a GTId as GTId arguments. + auto AddUserArgs = [&](Value >Id) { + for (Use &U : GTId.uses()) + if (CallInst *CI = dyn_cast(U.getUser())) + if (CI->isArgOperand(&U)) + if (Function *Callee = CI->getCalledFunction()) + if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI)) + GTIdArgs.insert(Callee->getArg(U.getOperandNo())); + }; + + // The argument users of __kmpc_global_thread_num calls are GTIds. + RuntimeFunctionInfo &GlobThreadNumRTI = + RTIs[OMPRTL___kmpc_global_thread_num]; + for (auto &It : GlobThreadNumRTI.UsesMap) + for (Use *U : It.second) + if (CallInst *CI = getCallIfRegularCall(*U, &GlobThreadNumRTI)) + AddUserArgs(*CI); + + // Transitively search for more arguments by looking at the users of the + // ones we know already. + for (unsigned u = 0; u < GTIdArgs.size(); ++u) + AddUserArgs(*GTIdArgs[u]); + } + + /// Return the call if \p U is a callee use in a regular call. If \p RTI is + /// given it has to be the callee or a nullptr is returned. + CallInst *getCallIfRegularCall(Use &U, RuntimeFunctionInfo *RTI = nullptr) { + CallInst *CI = dyn_cast(U.getUser()); + if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && + (!RTI || CI->getCalledFunction() == RTI->Declaration)) + return CI; + return nullptr; + } + + /// Return the call if \p V is a regular call. If \p RTI is given it has to be + /// the callee or a nullptr is returned. + CallInst *getCallIfRegularCall(Value &V, RuntimeFunctionInfo *RTI = nullptr) { + CallInst *CI = dyn_cast(&V); + if (CI && !CI->hasOperandBundles() && + (!RTI || CI->getCalledFunction() == RTI->Declaration)) + return CI; + return nullptr; + } + + /// Helper to initialize all types defined in OpenMPKinds.def. + void initializeTypes(Module &M) { + LLVMContext &Ctx = M.getContext(); + // Create all simple and struct types exposed by the runtime and remember + // the llvm::PointerTypes of them for easy access later. + Type *T; +#define OMP_TYPE(VarName, InitValue) VarName = InitValue; +#define OMP_STRUCT_TYPE(VarName, StructName, ...) \ + T = M.getTypeByName(StructName); \ + if (!T) \ + T = StructType::create(Ctx, {__VA_ARGS__}, StructName); \ + VarName = PointerType::getUnqual(T); +#include "llvm/IR/OpenMPKinds.def" + } + + /// Helper to initialize all runtime function information for those defined in + /// OpenMPKinds.def. + bool initializeRuntimeFunctions(Module &M) { + bool FoundAny = false; +#define OMP_RTL(_Enum, _Name, _IsVarARg, _ReturnType, ...) \ + { \ + auto &RTI = RTIs[_Enum]; \ + RTI.Kind = _Enum; \ + RTI.Name = _Name; \ + RTI.IsVarARg = _IsVarARg; \ + RTI.ReturnType = _ReturnType; \ + RTI.ArgumentTypes = SmallVector({__VA_ARGS__}); \ + unsigned NumUses = 0; \ + if ((RTI.Declaration = M.getFunction(_Name))) { \ + FoundAny = true; \ + for (Use & U : RTI.Declaration->uses()) { \ + if (Instruction *UserI = dyn_cast(U.getUser())) { \ + if (ModuleSlice.count(UserI->getFunction())) { \ + RTI.UsesMap[UserI->getFunction()].push_back(&U); \ + ++NumUses; \ + } \ + } else { \ + RTI.UsesMap[nullptr].push_back(&U); \ + ++NumUses; \ + } \ + } \ + } \ + (void)NumUses; \ + LLVM_DEBUG({ \ + dbgs() << Tag << RTI.Name << (RTI.Declaration ? "" : " not") \ + << " found\n"; \ + if (RTI.Declaration) \ + dbgs() << Tag << "-> got " << NumUses << " uses in " \ + << RTI.UsesMap.size() << " different functions.\n"; \ + }); \ + } +#include "llvm/IR/OpenMPKinds.def" + // TODO: We should validate the declaration agains the types we expect. + return FoundAny; + } + + /// The underyling module. + Module &M; + + /// The SCC we are operating on. + SmallPtrSetImpl &SCC; + + /// The slice of the module we are allowed to look at. + SmallPtrSetImpl &ModuleSlice; + + /// Map from runtime function kind to the runtime function description. + std::map RTIs; +}; +} // namespace + +PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C, + CGSCCAnalysisManager &AM, + LazyCallGraph &CG, CGSCCUpdateResult &UR) { + if (DisableOpenMPOptimizations) + return PreservedAnalyses::all(); + + SmallPtrSet SCC; + for (LazyCallGraph::Node &N : C) + SCC.insert(&N.getFunction()); + if (SCC.empty()) + return PreservedAnalyses::all(); + + // TODO: Compute the module slice we are allowed to look at. + OpenMPOpt OMPOpt(SCC, SCC); + bool Changed = OMPOpt.run(); + (void)Changed; + return PreservedAnalyses::all(); +} diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running pass: InlinerPass ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass +; CHECK-O-NEXT: Running pass: OpenMPOptPass on (foo) ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O-NEXT: Starting llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: SROA diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -107,6 +107,7 @@ ; CHECK-O-NEXT: Running pass: InlinerPass ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass +; CHECK-O-NEXT: Running pass: OpenMPOptPass on (foo) ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O-NEXT: Starting llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: SROA diff --git a/llvm/test/Transforms/OpenMP/gtid.ll b/llvm/test/Transforms/OpenMP/gtid.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/gtid.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -passes=openmpopt -S < %s | FileCheck %s +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) +declare void @useI32(i32) + +define void @external(i1 %c) { +; CHECK-LABEL: define {{[^@]+}}@external +; CHECK-SAME: (i1 [[C:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C2:%.*]] = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) +; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[E:%.*]] +; CHECK: t: +; CHECK-NEXT: call void @internal(i32 [[C2]], i32 [[C2]]) +; CHECK-NEXT: call void @useI32(i32 [[C2]]) +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: e: +; CHECK-NEXT: call void @internal(i32 [[C2]], i32 [[C2]]) +; CHECK-NEXT: call void @useI32(i32 [[C2]]) +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: call void @internal(i32 0, i32 [[C2]]) +; CHECK-NEXT: call void @useI32(i32 [[C2]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %t, label %e +t: + %c0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + call void @internal(i32 %c0, i32 %c0) + call void @useI32(i32 %c0) + br label %m +e: + %c1 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + call void @internal(i32 %c1, i32 %c1) + call void @useI32(i32 %c1) + br label %m +m: + %c2 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + call void @internal(i32 0, i32 %c2) + call void @useI32(i32 %c2) + ret void +} + +define internal void @internal(i32 %not_gtid, i32 %gtid) { +; CHECK-LABEL: define {{[^@]+}}@internal +; CHECK-SAME: (i32 [[NOT_GTID:%.*]], i32 [[GTID:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[GTID]], [[GTID]] +; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[E:%.*]] +; CHECK: t: +; CHECK-NEXT: call void @useI32(i32 [[GTID]]) +; CHECK-NEXT: call void @external(i1 [[C]]) +; CHECK-NEXT: br label [[M:%.*]] +; CHECK: e: +; CHECK-NEXT: call void @useI32(i32 [[GTID]]) +; CHECK-NEXT: br label [[M]] +; CHECK: m: +; CHECK-NEXT: call void @useI32(i32 [[GTID]]) +; CHECK-NEXT: ret void +; +entry: + %cc = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + %c = icmp eq i32 %cc, %gtid + br i1 %c, label %t, label %e +t: + %c0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + call void @useI32(i32 %c0) + call void @external(i1 %c) + br label %m +e: + %c1 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + call void @useI32(i32 %c1) + br label %m +m: + %c2 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) + call void @useI32(i32 %c2) + ret void +}