diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
--- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1                                 -verify=host      -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1                                 -verify=all,safe  -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe  -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1                                 -verify=host      -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1                                 -verify=all,safe  -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe  -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
 
 // host-no-diagnostics
 
 void bar1(void) {
 #pragma omp parallel // #0
                      // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
-                     // safe-remark@#0 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+                     // safe-remark@#0 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
                      // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: <NONE>}}
   {
   }
@@ -15,7 +15,7 @@
 void bar2(void) {
 #pragma omp parallel // #1
                      // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
-                     // safe-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+                     // safe-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
                      // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: <NONE>}}
   {
   }
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1                                 -verify=host -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1                                 -verify      -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify      -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1                                 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1                                 -verify      -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify      -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
 
 // host-no-diagnostics
 
 void bar(void) {
 #pragma omp parallel // #1                                                                                                                                                                                                                                                                                                                                           \
                      // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
-                     // expected-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+                     // expected-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
   {
   }
 }
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1083,13 +1083,15 @@
   ///                  the abstract attributes.
   /// \param CGUpdater Helper to update an underlying call graph.
   /// \param Allowed If not null, a set limiting the attribute opportunities.
-  /// \param DeleteFns Whether to delete functions
+  /// \param DeleteFns Whether to delete functions.
+  /// \param RewriteSignatures Whether to rewrite function signatures.
   Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
              CallGraphUpdater &CGUpdater,
-             DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true)
+             DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
+             bool RewriteSignatures = true)
       : Allocator(InfoCache.Allocator), Functions(Functions),
         InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
-        DeleteFns(DeleteFns) {}
+        DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures) {}
 
   ~Attributor();
 
@@ -1665,6 +1667,21 @@
   ///
   static void createShallowWrapper(Function &F);
 
+  /// Make another copy of the function \p F such that the copied version has
+  /// internal linkage afterwards and can be analysed. Then we replace all uses
+  /// of the original function to the copied one
+  ///
+  /// Only non-locally linked functions that have `linkonce_odr` or `weak_odr`
+  /// linkage can be internalized because these linkages guarantee that other
+  /// definitions with the same name have the same semantics as this one.
+  ///
+  /// This will only be run if the `attributor-allow-deep-wrappers` option is
+  /// set, or if the function is called with \p Force set to true.
+  ///
+  /// If the function \p F failed to be internalized the return value will be a
+  /// null pointer.
+  static Function *internalizeFunction(Function &F, bool Force = false);
+
   /// Return the data layout associated with the anchor scope.
   const DataLayout &getDataLayout() const { return InfoCache.DL; }
 
@@ -1777,6 +1794,9 @@
   /// Whether to delete functions.
   const bool DeleteFns;
 
+  /// Whether to rewrite signatures.
+  const bool RewriteSignatures;
+
   /// A set to remember the functions we already assume to be live and visited.
   DenseSet<const Function *> VisitedFunctions;
 
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -1621,19 +1621,12 @@
   NumFnShallowWrappersCreated++;
 }
 
-/// Make another copy of the function \p F such that the copied version has
-/// internal linkage afterwards and can be analysed. Then we replace all uses
-/// of the original function to the copied one
-///
-/// Only non-exactly defined functions that have `linkonce_odr` or `weak_odr`
-/// linkage can be internalized because these linkages guarantee that other
-/// definitions with the same name have the same semantics as this one
-///
-static Function *internalizeFunction(Function &F) {
-  assert(AllowDeepWrapper && "Cannot create a copy if not allowed.");
-  assert(!F.isDeclaration() && !F.hasExactDefinition() &&
-         !GlobalValue::isInterposableLinkage(F.getLinkage()) &&
-         "Trying to internalize function which cannot be internalized.");
+Function *Attributor::internalizeFunction(Function &F, bool Force) {
+  if (!AllowDeepWrapper && !Force)
+    return nullptr;
+  if (F.isDeclaration() || F.hasLocalLinkage() ||
+      GlobalValue::isInterposableLinkage(F.getLinkage()))
+    return nullptr;
 
   Module &M = *F.getParent();
   FunctionType *FnTy = F.getFunctionType();
@@ -1663,7 +1656,8 @@
   SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
   F.getAllMetadata(MDs);
   for (auto MDIt : MDs)
-    Copied->addMetadata(MDIt.first, *MDIt.second);
+    if (!Copied->hasMetadata())
+      Copied->addMetadata(MDIt.first, *MDIt.second);
 
   M.getFunctionList().insert(F.getIterator(), Copied);
   F.replaceAllUsesWith(Copied);
@@ -1675,6 +1669,9 @@
 bool Attributor::isValidFunctionSignatureRewrite(
     Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
 
+  if (!RewriteSignatures)
+    return false;
+
   auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
     // Forbid the call site to cast the function return type. If we need to
     // rewrite these functions we need to re-create a cast for the new call site
@@ -2459,7 +2456,8 @@
       Function *F = Functions[u];
       if (!F->isDeclaration() && !F->isDefinitionExact() && F->getNumUses() &&
           !GlobalValue::isInterposableLinkage(F->getLinkage())) {
-        Function *NewF = internalizeFunction(*F);
+        Function *NewF = Attributor::internalizeFunction(*F);
+        assert(NewF && "Could not internalize function.");
         Functions.insert(NewF);
 
         // Update call graph
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1623,9 +1623,9 @@
     };
     GlobalizationRFI.foreachUse(SCC, CreateAA);
 
-    for (auto &F : M) {
-      if (!F.isDeclaration())
-        A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
+    for (auto *F : SCC) {
+      if (!F->isDeclaration())
+        A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
     }
   }
 };
@@ -2620,11 +2620,19 @@
   if (DisableOpenMPOptimizations)
     return PreservedAnalyses::all();
 
-  // Look at every function definition in the Module.
+  // Create internal copies of each function if this is a kernel Module.
+  DenseSet<const Function *> InternalizedFuncs;
+  if (!OMPInModule.getKernels().empty())
+    for (Function &F : M)
+      if (!F.isDeclaration() && !OMPInModule.getKernels().contains(&F))
+        if (Attributor::internalizeFunction(F, /* Force */ true))
+          InternalizedFuncs.insert(&F);
+
+  // Look at every function definition in the Module that wasn't internalized.
   SmallVector<Function *, 16> SCC;
-  for (Function &Fn : M)
-    if (!Fn.isDeclaration())
-      SCC.push_back(&Fn);
+  for (Function &F : M)
+    if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
+      SCC.push_back(&F);
 
   if (SCC.empty())
     return PreservedAnalyses::all();
@@ -2645,7 +2653,7 @@
   OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions,
                                 OMPInModule.getKernels());
 
-  Attributor A(Functions, InfoCache, CGUpdater);
+  Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false);
 
   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
   bool Changed = OMPOpt.run(true);
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -2,6 +2,8 @@
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64"
 
+@S = external local_unnamed_addr global i8*
+
 ; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
 ; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
 
@@ -67,7 +69,7 @@
 define void @use(i8* %x) {
 entry:
   %addr = alloca i8*
-  store i8* %x, i8** %addr
+  store i8* %x, i8** @S
   ret void
 }
 
diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
--- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
+++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
@@ -1,8 +1,8 @@
-; RUN: opt -passes=openmp-opt-cgscc -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=openmp-opt -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ; ModuleID = 'single_threaded_exeuction.c'
 
-define void @kernel() {
+define weak void @kernel() {
   call void @__kmpc_kernel_init(i32 512, i16 1)
   call void @nvptx()
   call void @amdgcn()
@@ -12,14 +12,15 @@
 ; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
 ; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
 ; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define dso_local void @nvptx() {
+; Function Attrs: noinline
+define internal void @nvptx() {
 entry:
   %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cmp = icmp eq i32 %call, 0
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
+  call void @foo()
   call void @bar()
   br label %if.end
 
@@ -30,14 +31,15 @@
 ; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
 ; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
 ; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define dso_local void @amdgcn() {
+; Function Attrs: noinline
+define internal void @amdgcn() {
 entry:
   %call = call i32 @llvm.amdgcn.workitem.id.x()
   %cmp = icmp eq i32 %call, 0
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
+  call void @foo()
   call void @bar()
   br label %if.end
 
@@ -45,9 +47,16 @@
   ret void
 }
 
-; CHECK: [openmp-opt] Basic block @bar entry is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define internal void @bar() {
+; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
+; Function Attrs: noinline
+define internal void @foo() {
+entry:
+  ret void
+}
+
+; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread.
+; Function Attrs: noinline
+define void @bar() {
 entry:
   ret void
 }