diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+void bar1(void) {
+#pragma omp parallel // #0
+  // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+  // safe-remark@#0 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  // force-remark@#0 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}}
+  // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: }}
+  {
+  }
+}
+void bar2(void) {
+#pragma omp parallel // #1
+  // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+  // safe-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  // force-remark@#1 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}}
+  // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: }}
+  {
+  }
+}
+
+void foo1(void) {
+#pragma omp target teams // #2
+  // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+  // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #3
+    // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+#pragma omp parallel // #4
+    // all-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+  }
+}
+
+void foo2(void) {
+#pragma omp target teams // #5
+  // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading_22}}
+  // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #6
+    // all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#6 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+#pragma omp parallel // #7
+    // all-remark@#7 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#7 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+  }
+}
+
+void foo3(void) {
+#pragma omp target teams // #8
+  // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading_22}}
+  // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #9
+    // all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#9 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+#pragma omp parallel // #10
+    // all-remark@#10 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#10 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+  }
+}
+
+void spmd(void) {
+  // Verify we do not emit the remarks above for "SPMD" regions.
+#pragma omp target teams
+#pragma omp parallel
+  {
+  }
+
+#pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; ++i) {
+  }
+}
+
+// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+void bar(void) {
+#pragma omp parallel // #1 \
+  // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+  // expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  {
+  }
+}
+
+void foo(void) {
+#pragma omp target teams // #2 \
+  // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}} \
+  // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #3 \
+  // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+  // expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar();
+#pragma omp parallel // #4 \
+  // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+  // expected-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+  }
+}
+
+void spmd(void) {
+  // Verify we do not emit the remarks above for "SPMD" regions.
+#pragma omp target teams
+#pragma omp parallel
+  {
+  }
+
+#pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; ++i) {
+  }
+}
+
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1033,6 +1033,7 @@
   // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
   // all.
   bool UnknownUse = false;
+  bool KernelPrepareUse = false;
   unsigned NumDirectCalls = 0;

   SmallVector<Use *, 8> ToBeReplacedStateMachineUses;
@@ -1047,33 +1048,92 @@
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
-      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(),
-                                          &KernelPrepareParallelRFI)) {
+      if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
+                                   *U.getUser(), &KernelPrepareParallelRFI)) {
+        KernelPrepareUse = true;
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
       UnknownUse = true;
     });

-    // If this ever hits, we should investigate.
-    if (UnknownUse || NumDirectCalls != 1)
+    // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+    // use.
+    if (!KernelPrepareUse)
       continue;

-    // TODO: This is not a necessary restriction and should be lifted.
-    if (ToBeReplacedStateMachineUses.size() != 2)
+    {
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "Found a parallel region that is called in a target "
+                     "region but not part of a combined target construct nor "
+                     "nested inside a target construct without intermediate "
+                     "code. This can lead to excessive register usage for "
+                     "unrelated target regions in the same translation unit "
+                     "due to spurious call edges assumed by ptxas.";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+    }
+
+    // If this ever hits, we should investigate.
+    // TODO: Checking the number of uses is not a necessary restriction and
+    // should be lifted.
+    if (UnknownUse || NumDirectCalls != 1 ||
+        ToBeReplacedStateMachineUses.size() != 2) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region is used in "
+                    << (UnknownUse ? "unknown" : "unexpected")
+                    << " ways; will not attempt to rewrite the state machine.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+      }
       continue;
+    }

     // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
     // up if the function is not called from a unique kernel.
     Kernel K = getUniqueKernelFor(*F);
-    if (!K)
+    if (!K) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region is not known to be called from a "
+                       "unique single target region, maybe the surrounding "
+                       "function has external linkage?; will not attempt to "
+                       "rewrite the state machine use.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
+                             Remark);
+      }
       continue;
+    }

     // We now know F is a parallel body function called only from the kernel K.
     // We also identified the state machine uses in which we replace the
     // function pointer by a new global symbol for identification purposes. This
     // ensures only direct calls to the function are left.
+    {
+      auto RemarkParallelRegion = [&](OptimizationRemark OR) {
+        return OR << "Specialize parallel region that is only reached from a "
+                     "single target region to avoid spurious call edges and "
+                     "excessive register usage in other target regions. "
+                     "(parallel region ID: "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << ", kernel ID: "
+                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
+                           RemarkParallelRegion);
+      auto RemarkKernel = [&](OptimizationRemark OR) {
+        return OR << "Target region containing the parallel region that is "
+                     "specialized. (parallel region ID: "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << ", kernel ID: "
+                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+      };
+      emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+    }
+
     Module &M = *F->getParent();
     Type *Int8Ty = Type::getInt8Ty(M.getContext());
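
For reference, a minimal stand-alone reproducer for the new remarks, outside the lit tests above. This is an illustrative sketch, not part of the patch: the file name, function names, and driver invocation are made up here, while the flags mirror the cc1 RUN lines and the code mirrors the bar1()/foo1() pattern in the tests.

// reproducer.c -- a sketch; compile with something like:
//   clang -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Rpass=openmp reproducer.c
// The parallel region in helper() is expected to trigger the "Found a parallel
// region that is called in a target region but not part of a combined target
// construct ..." remark, and, because helper() has external linkage and could
// be reached from target regions in other translation units, also the
// "Parallel region is not known to be called from a unique single target
// region ..." remark.
void helper(void) {
#pragma omp parallel // Called from a target region, but not uniquely attributable to one.
  {
  }
}

int main(void) {
#pragma omp target teams // Non-SPMD target region; reaches helper()'s parallel region indirectly.
  {
    helper();
  }
  return 0;
}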