diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -3839,6 +3839,45 @@
   static const char ID;
 };
 
+/// An abstract attribute describing which parts of a function are executed
+/// by a single thread.
+struct AAExecutionDomain
+    : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAExecutionDomain(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Returns true if single-threaded execution is assumed.
+  bool isAssumedSingleThreaded() const { return getAssumed(); }
+
+  /// Returns true if single-threaded execution is known.
+  bool isKnownSingleThreaded() const { return getKnown(); }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAExecutionDomain &createForPosition(const IRPosition &IRP,
+                                              Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAExecutionDomain"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// Check if an instruction is executed by a single thread.
+  virtual bool isSingleThreadExeuction(const Instruction *) = 0;
+
+  /// Check if a basic block is executed by a single thread.
+  virtual bool isSingleThreadExeuction(const BasicBlock *) = 0;
+
+  /// This function should return true if the type of the \p AA is
+  /// AAExecutionDomain.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
 /// Run options, used by the pass manager.
enum AttributorRunOption { NONE = 0, diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -134,6 +135,7 @@ PIPE_OPERATOR(AAUndefinedBehavior) PIPE_OPERATOR(AAPotentialValues) PIPE_OPERATOR(AANoUndef) +PIPE_OPERATOR(AAExecutionDomain) #undef PIPE_OPERATOR } // namespace llvm @@ -4342,6 +4344,7 @@ unsigned &RemainingUsesToExplore; }; +// interested in AA for specific positions. ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { const IRPosition &IRP = getIRPosition(); const Value *V = isArgumentPosition() ? IRP.getAssociatedArgument() @@ -8115,6 +8118,93 @@ /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) } }; + +/// ----------------------- Execution Domain Analysis -------------------------- + +struct AAExecutionDomainImpl : public AAExecutionDomain { + AAExecutionDomainImpl(const IRPosition &IRP, Attributor &A) + : AAExecutionDomain(IRP, A) {} + + const std::string getAsStr() const override { return "[ED]"; } + + ChangeStatus manifest(Attributor &A) override { + return ChangeStatus::UNCHANGED; + } + + ChangeStatus updateImpl(Attributor &A) override; + + /// Check if an instruction is executed by a single thread. 
+ bool isSingleThreadExeuction(const Instruction *I) override { + return isSingleThreadExeuction(I->getParent()); + } + + bool isSingleThreadExeuction(const BasicBlock *BB) override { + return SingleThreadedBBs[BB]; + } + + DenseMap SingleThreadedBBs; +}; + +ChangeStatus AAExecutionDomainImpl::updateImpl(Attributor &A) { + Function *F = getAnchorScope(); + llvm::ReversePostOrderTraversal RPOT(F); + + // TODO: Simplify this value to propogate constants. + // TODO: Check more than a single use. + // TODO: Add a LibraryInfo for functions that return a thread ID. + auto IsMasterEdge = [](BranchInst *Edge, BasicBlock *TrueBB) { + if (!Edge || !Edge->isConditional()) + return false; + if (Edge->getSuccessor(0) != TrueBB) + return false; + + auto *Cmp = dyn_cast(Edge->getCondition()); + if (!Cmp || !Cmp->isTrueWhenEqual()) + return false; + + ConstantInt *C = dyn_cast(Cmp->getOperand(1)); + if (!C || !C->isZero()) + return false; + ; + if (auto *CB = dyn_cast(Cmp->getOperand(0))) { + if (CB->getCalledFunction()->getName() == "omp_get_thread_num" || + CB->getCalledFunction()->getName() == "__kmpc_global_thread_num" || + CB->getCalledFunction()->getName() == "__kmpc_master") + return true; + } + + return false; + }; + + // TODO: Handle more complex cases than only the master thread. + auto MergePredecessorStates = [&](BasicBlock *BB) { + if (pred_begin(BB) == pred_end(BB)) + return false; + + bool IsSingleThreaded = true; + for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { + if (!IsMasterEdge(dyn_cast((*I)->getTerminator()), BB)) + IsSingleThreaded &= SingleThreadedBBs[*I]; + } + + return IsSingleThreaded; + }; + + for (auto *BB : RPOT) { + SingleThreadedBBs[BB] = MergePredecessorStates(BB); + } + + return ChangeStatus::UNCHANGED; +} + +struct AAExecutionDomainFunction final : public AAExecutionDomainImpl { + AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) + : AAExecutionDomainImpl(IRP, A) {} + + /// See AbstractAttribute::trackStatistics(). 
+ void trackStatistics() const override {} +}; + } // namespace const char AAReturnedValues::ID = 0; @@ -8140,6 +8230,7 @@ const char AAValueConstantRange::ID = 0; const char AAPotentialValues::ID = 0; const char AANoUndef::ID = 0; +const char AAExecutionDomain::ID = 0; // Macro magic to create the static generator function for attributes that // follow the naming scheme. @@ -8259,6 +8350,7 @@ CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAExecutionDomain) CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -56,6 +56,9 @@ " transfers"), cl::Hidden, cl::init(false)); +static cl::opt IdentifySingleThreads("openmp-identify-single-threads", + cl::init(false), cl::Hidden); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -505,9 +508,13 @@ << " functions in a slice with " << OMPInfoCache.ModuleSlice.size() << " functions\n"); + Changed |= runAttributor(); + if (IsModulePass) { if (remarksEnabled()) analysisGlobalization(); + if (IdentifySingleThreads) + identifySingleThreadedBBs(); } else { if (PrintICVValues) printICVs(); @@ -516,7 +523,6 @@ Changed |= rewriteDeviceCodeStateMachine(); - Changed |= runAttributor(); // Recollect uses, in case Attributor deleted any. 
OMPInfoCache.recollectUses(); @@ -1115,6 +1121,25 @@ return Changed; } + void identifySingleThreadedBBs() { + for (Function &F : M) { + IRPosition FnPos = IRPosition::function(F); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (auto ED = A.lookupAAFor(FnPos)) + if (!ED->isSingleThreadExeuction(&I)) + continue; + + auto Remark = [&](OptimizationRemarkAnalysis ORA) { + return ORA << "Found Instruction executed by a single thread."; + }; + emitRemark(&I, "OpenMPSingleThread", + Remark); + } + } + } + } + void analysisGlobalization() { RuntimeFunction GlobalizationRuntimeIDs[] = { OMPRTL___kmpc_data_sharing_coalesced_push_stack, @@ -1606,6 +1631,13 @@ GetterRFI.foreachUse(SCC, CreateAA); } + for (auto &F : M) { + if (F.isDeclaration()) + continue; + + IRPosition FnPos = IRPosition::function(F); + A.getOrCreateAAFor(FnPos); + } } }; diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -0,0 +1,89 @@ +; RUN: opt -passes=openmp-opt -pass-remarks-analysis=openmp-opt -openmp-identify-single-threads -disable-output %s 2>&1 | FileCheck %s +; ModuleID = 'single_threaded_execution.c' +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@x = dso_local global i32 0, align 4 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@0 = private unnamed_addr constant [40 x i8] c";single_threaded_execution.c;main;7;1;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([40 x i8], [40 x i8]* @0, i32 0, i32 0) }, align 8 + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @main() #0 !dbg !7 { +entry: + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*)), !dbg !9 + ret i32 0, !dbg !10 +} + +; Function Attrs: noinline norecurse nounwind uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 !dbg !11 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %call = call i32 @omp_get_thread_num(), !dbg !12 + %cmp = icmp eq i32 %call, 0, !dbg !13 + br i1 %cmp, label %if.then, label %if.end5, !dbg !12 + +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +if.then: ; preds = %entry + %0 = load i32, i32* @x, align 4, !dbg !14 + %cmp1 = icmp eq i32 %0, 1, !dbg !15 + br i1 %cmp1, label %if.then2, label %if.end, !dbg !14 + +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +if.then2: ; preds = %if.then + %call3 = call i32 @omp_get_thread_num(), !dbg !16 + %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %call3), !dbg !17 + br label %if.end, !dbg !17 + +; CHECK: remark: single_threaded_execution.c:{{[0-9]+}} +if.end: ; preds = %if.then2, %if.then + br label %if.end5, !dbg !18 + +if.end5: ; preds = %if.end, %entry + ret void, !dbg !19 +} + +declare dso_local i32 @omp_get_thread_num() #2 + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: nounwind +declare !callback !20 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
#3 + +attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { noinline norecurse nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "single_threaded_execution.c", directory: "/home/jhuber/Documents/llvm-project/build") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 13.0.0"} +!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 6, type: !8, scopeLine: 6, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 7, column: 1, scope: !7) +!10 = !DILocation(line: 13, column: 1, scope: !7) +!11 = distinct !DISubprogram(name: ".omp_outlined.", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2) +!12 = !DILocation(line: 9, column: 9, scope: !11) +!13 = !DILocation(line: 9, column: 30, scope: !11) +!14 = !DILocation(line: 10, column: 11, scope: !11) +!15 = 
!DILocation(line: 10, column: 13, scope: !11) +!16 = !DILocation(line: 11, column: 25, scope: !11) +!17 = !DILocation(line: 11, column: 9, scope: !11) +!18 = !DILocation(line: 10, column: 16, scope: !11) +!19 = !DILocation(line: 12, column: 3, scope: !11) +!20 = !{!21} +!21 = !{i64 2, i64 -1, i64 -1, i1 true}