diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c --- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c +++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out -// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out +// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out +// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out // host-no-diagnostics void bar1(void) { #pragma omp parallel // #0 // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} - // safe-remark@#0 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}} + // safe-remark@#0 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}} // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: }} { } @@ -15,7 +15,7 @@ void bar2(void) { #pragma omp parallel // #1 // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} - // safe-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}} + // safe-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}} // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: }} { } diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c --- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c +++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -verify=host -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out -// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out +// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out +// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out // host-no-diagnostics void bar(void) { #pragma omp parallel // #1 \ // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \ - // expected-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}} + // expected-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}} { } } diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1083,13 +1083,15 @@ /// the abstract attributes. /// \param CGUpdater Helper to update an underlying call graph. /// \param Allowed If not null, a set limiting the attribute opportunities. - /// \param DeleteFns Whether to delete functions + /// \param DeleteFns Whether to delete functions. + /// \param RewriteSignatures Whether to rewrite function signatures. Attributor(SetVector &Functions, InformationCache &InfoCache, CallGraphUpdater &CGUpdater, - DenseSet *Allowed = nullptr, bool DeleteFns = true) + DenseSet *Allowed = nullptr, bool DeleteFns = true, + bool RewriteSignatures = true) : Allocator(InfoCache.Allocator), Functions(Functions), InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed), - DeleteFns(DeleteFns) {} + DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures) {} ~Attributor(); @@ -1665,6 +1667,21 @@ /// static void createShallowWrapper(Function &F); + /// Make another copy of the function \p F such that the copied version has + /// internal linkage afterwards and can be analysed. Then we replace all uses + /// of the original function to the copied one + /// + /// Only non-locally linked functions that have `linkonce_odr` or `weak_odr` + /// linkage can be internalized because these linkages guarantee that other + /// definitions with the same name have the same semantics as this one. + /// + /// This will only be run if the `attributor-allow-deep-wrappers` option is + /// set, or if the function is called with \p Force set to true. + /// + /// If the function \p F failed to be internalized the return value will be a + /// null pointer. + static Function *internalizeFunction(Function &F, bool Force = false); + /// Return the data layout associated with the anchor scope. const DataLayout &getDataLayout() const { return InfoCache.DL; } @@ -1777,6 +1794,9 @@ /// Whether to delete functions. const bool DeleteFns; + /// Whether to rewrite signatures. + const bool RewriteSignatures; + /// A set to remember the functions we already assume to be live and visited. DenseSet VisitedFunctions; diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -1621,19 +1621,12 @@ NumFnShallowWrappersCreated++; } -/// Make another copy of the function \p F such that the copied version has -/// internal linkage afterwards and can be analysed. Then we replace all uses -/// of the original function to the copied one -/// -/// Only non-exactly defined functions that have `linkonce_odr` or `weak_odr` -/// linkage can be internalized because these linkages guarantee that other -/// definitions with the same name have the same semantics as this one -/// -static Function *internalizeFunction(Function &F) { - assert(AllowDeepWrapper && "Cannot create a copy if not allowed."); - assert(!F.isDeclaration() && !F.hasExactDefinition() && - !GlobalValue::isInterposableLinkage(F.getLinkage()) && - "Trying to internalize function which cannot be internalized."); +Function *Attributor::internalizeFunction(Function &F, bool Force) { + if (!AllowDeepWrapper && !Force) + return nullptr; + if (F.isDeclaration() || F.hasLocalLinkage() || + GlobalValue::isInterposableLinkage(F.getLinkage())) + return nullptr; Module &M = *F.getParent(); FunctionType *FnTy = F.getFunctionType(); @@ -1663,7 +1656,8 @@ SmallVector, 1> MDs; F.getAllMetadata(MDs); for (auto MDIt : MDs) - Copied->addMetadata(MDIt.first, *MDIt.second); + if (!Copied->hasMetadata()) + Copied->addMetadata(MDIt.first, *MDIt.second); M.getFunctionList().insert(F.getIterator(), Copied); F.replaceAllUsesWith(Copied); @@ -1675,6 +1669,9 @@ bool Attributor::isValidFunctionSignatureRewrite( Argument &Arg, ArrayRef ReplacementTypes) { + if (!RewriteSignatures) + return false; + auto CallSiteCanBeChanged = [](AbstractCallSite ACS) { // Forbid the call site to cast the function return type. If we need to // rewrite these functions we need to re-create a cast for the new call site @@ -2459,7 +2456,8 @@ Function *F = Functions[u]; if (!F->isDeclaration() && !F->isDefinitionExact() && F->getNumUses() && !GlobalValue::isInterposableLinkage(F->getLinkage())) { - Function *NewF = internalizeFunction(*F); + Function *NewF = Attributor::internalizeFunction(*F); + assert(NewF && "Could not internalize function."); Functions.insert(NewF); // Update call graph diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1623,9 +1623,9 @@ }; GlobalizationRFI.foreachUse(SCC, CreateAA); - for (auto &F : M) { - if (!F.isDeclaration()) - A.getOrCreateAAFor(IRPosition::function(F)); + for (auto *F : SCC) { + if (!F->isDeclaration()) + A.getOrCreateAAFor(IRPosition::function(*F)); } } }; @@ -2620,11 +2620,19 @@ if (DisableOpenMPOptimizations) return PreservedAnalyses::all(); - // Look at every function definition in the Module. + // Create internal copies of each function if this is a kernel Module. + DenseSet InternalizedFuncs; + if (!OMPInModule.getKernels().empty()) + for (Function &F : M) + if (!F.isDeclaration() && !OMPInModule.getKernels().contains(&F)) + if (Attributor::internalizeFunction(F, /* Force */ true)) + InternalizedFuncs.insert(&F); + + // Look at every function definition in the Module that wasn't internalized. SmallVector SCC; - for (Function &Fn : M) - if (!Fn.isDeclaration()) - SCC.push_back(&Fn); + for (Function &F : M) + if (!F.isDeclaration() && !InternalizedFuncs.contains(&F)) + SCC.push_back(&F); if (SCC.empty()) return PreservedAnalyses::all(); @@ -2645,7 +2653,7 @@ OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, OMPInModule.getKernels()); - Attributor A(Functions, InfoCache, CGUpdater); + Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(true); diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -2,6 +2,8 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64" +@S = external local_unnamed_addr global i8* + ; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef ; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef @@ -67,7 +69,7 @@ define void @use(i8* %x) { entry: %addr = alloca i8* - store i8* %x, i8** %addr + store i8* %x, i8** @S ret void } diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -1,8 +1,8 @@ -; RUN: opt -passes=openmp-opt-cgscc -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -passes=openmp-opt -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; ModuleID = 'single_threaded_exeuction.c' -define void @kernel() { +define weak void @kernel() { call void @__kmpc_kernel_init(i32 512, i16 1) call void @nvptx() call void @amdgcn() @@ -12,14 +12,15 @@ ; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread. ; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread. -; Function Attrs: noinline nounwind uwtable -define dso_local void @nvptx() { +; Function Attrs: noinline +define internal void @nvptx() { entry: %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %cmp = icmp eq i32 %call, 0 br i1 %cmp, label %if.then, label %if.end if.then: + call void @foo() call void @bar() br label %if.end @@ -30,14 +31,15 @@ ; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread. ; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread. -; Function Attrs: noinline nounwind uwtable -define dso_local void @amdgcn() { +; Function Attrs: noinline +define internal void @amdgcn() { entry: %call = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %call, 0 br i1 %cmp, label %if.then, label %if.end if.then: + call void @foo() call void @bar() br label %if.end @@ -45,9 +47,16 @@ ret void } -; CHECK: [openmp-opt] Basic block @bar entry is executed by a single thread. -; Function Attrs: noinline nounwind uwtable -define internal void @bar() { +; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread. +; Function Attrs: noinline +define internal void @foo() { +entry: + ret void +} + +; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread. +; Function Attrs: noinline +define void @bar() { entry: ret void }