diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -408,6 +408,7 @@
 def TargetX86 : TargetArch<["x86"]>;
 def TargetAnyX86 : TargetArch<["x86", "x86_64"]>;
 def TargetWebAssembly : TargetArch<["wasm32", "wasm64"]>;
+def TargetNVPTX : TargetArch<["nvptx", "nvptx64"]>;
 def TargetWindows : TargetSpec {
   let OSes = ["Win32"];
 }
@@ -1194,10 +1195,9 @@
 def : MutualExclusions<[CUDADeviceBuiltinSurfaceType,
                         CUDADeviceBuiltinTextureType]>;
 
-def CUDAGlobal : InheritableAttr {
-  let Spellings = [GNU<"global">, Declspec<"__global__">];
+def CUDAGlobal : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
+  let Spellings = [GNU<"global">, Declspec<"__global__">, Clang<"nvptx_kernel">];
   let Subjects = SubjectList<[Function]>;
-  let LangOpts = [CUDA];
   let Documentation = [Undocumented];
 }
 def : MutualExclusions<[CUDADevice, CUDAGlobal]>;
@@ -1225,10 +1225,9 @@
   let Documentation = [InternalOnly];
 }
 
-def CUDALaunchBounds : InheritableAttr {
-  let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">];
+def CUDALaunchBounds : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
+  let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">, Clang<"nvptx_launch_bounds">];
   let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>];
-  let LangOpts = [CUDA];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   // An AST node is created for this attribute, but is not used by other parts
   // of the compiler. However, this node needs to exist in the AST because
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1411,7 +1411,7 @@
     if (FD &&
         FD->getType()->castAs<FunctionType>()->getCallConv() == CC_X86RegCall) {
       Out << "__regcall3__" << II->getName();
-    } else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
+    } else if (FD && CGM.getLangOpts().CUDA && FD->hasAttr<CUDAGlobalAttr>() &&
                GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
       Out << "__device_stub__" << II->getName();
     } else {
@@ -1547,6 +1547,7 @@
   // device-mangling in host compilation could help catching certain ones.
   assert(!isa<FunctionDecl>(ND) || !ND->hasAttr<CUDAGlobalAttr>() ||
          getContext().shouldExternalize(ND) || getLangOpts().CUDAIsDevice ||
+         !getLangOpts().CUDA ||
          (getContext().getAuxTargetInfo() &&
           (getContext().getAuxTargetInfo()->getCXXABI() !=
            getContext().getTargetInfo().getCXXABI())) ||
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -7328,32 +7328,29 @@
     }
   }
 
-  // Perform special handling in CUDA mode.
-  if (M.getLangOpts().CUDA) {
-    // CUDA __global__ functions get a kernel metadata entry. Since
-    // __global__ functions cannot be called from the device, we do not
-    // need to set the noinline attribute.
-    if (FD->hasAttr<CUDAGlobalAttr>()) {
-      // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1);
-    }
-    if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>()) {
-      // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
-      llvm::APSInt MaxThreads(32);
-      MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(M.getContext());
-      if (MaxThreads > 0)
-        addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue());
-
-      // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
-      // not specified in __launch_bounds__ or if the user specified a 0 value,
-      // we don't have to add a PTX directive.
-      if (Attr->getMinBlocks()) {
-        llvm::APSInt MinBlocks(32);
-        MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(M.getContext());
-        if (MinBlocks > 0)
-          // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
-          addNVVMMetadata(F, "minctasm", MinBlocks.getExtValue());
-      }
+  // CUDA __global__ functions get a kernel metadata entry. Since
+  // __global__ functions cannot be called from the device, we do not
+  // need to set the noinline attribute.
+  if (FD->hasAttr<CUDAGlobalAttr>()) {
+    // Create !{<func-ref>, metadata !"kernel", i32 1} node
+    addNVVMMetadata(F, "kernel", 1);
+  }
+  if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>()) {
+    // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
+    llvm::APSInt MaxThreads(32);
+    MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(M.getContext());
+    if (MaxThreads > 0)
+      addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue());
+
+    // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
+    // not specified in __launch_bounds__ or if the user specified a 0 value,
+    // we don't have to add a PTX directive.
+    if (Attr->getMinBlocks()) {
+      llvm::APSInt MinBlocks(32);
+      MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(M.getContext());
+      if (MinBlocks > 0)
+        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
+        addNVVMMetadata(F, "minctasm", MinBlocks.getExtValue());
     }
   }
 }
diff --git a/clang/test/CodeGen/nvptx_attributes.c b/clang/test/CodeGen/nvptx_attributes.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/nvptx_attributes.c
@@ -0,0 +1,51 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_61 -emit-llvm %s -o - | FileCheck %s
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@device
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 1
+//
+int device() {return 1;};
+
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@foo
+// CHECK-SAME: (ptr noundef [[RET:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RET_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[RET]], ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @device()
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[CALL]], ptr [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+__attribute__((nvptx_kernel)) void foo(int *ret) {
+  *ret = device();
+}
+
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@bar
+// CHECK-SAME: (ptr noundef [[RET:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RET_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[RET]], ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @device()
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[CALL]], ptr [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+__attribute__((nvptx_kernel, nvptx_launch_bounds(1, 128))) void bar(int *ret) {
+  *ret = device();
+}
+
+
+//.
+// CHECK: attributes #0 = { noinline nounwind optnone "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" }
+//.
+// CHECK: !0 = !{ptr @foo, !"kernel", i32 1}
+// CHECK: !1 = !{ptr @bar, !"kernel", i32 1}
+// CHECK: !2 = !{ptr @bar, !"maxntidx", i32 1}
+// CHECK: !3 = !{ptr @bar, !"minctasm", i32 128}
+// CHECK: !4 = !{i32 1, !"wchar_size", i32 4}
+// CHECK: !5 = !{!"clang version 16.0.0"}
+//.