diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -24,6 +24,12 @@
   let name = "nvvm";
   let cppNamespace = "::mlir::NVVM";
   let dependentDialects = ["LLVM::LLVMDialect"];
+
+  let extraClassDeclaration = [{
+    /// Get the name of the attribute used to annotate external kernel
+    /// functions.
+    static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; }
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -24,6 +24,12 @@
   let name = "rocdl";
   let cppNamespace = "::mlir::ROCDL";
   let dependentDialects = ["LLVM::LLVMDialect"];
+
+  let extraClassDeclaration = [{
+    /// Get the name of the attribute used to annotate external kernel
+    /// functions.
+    static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -17,6 +17,7 @@
 add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
   ConvertLaunchFuncToRuntimeCalls.cpp
   ConvertKernelFuncToBlob.cpp
+  GPUOpsLowering.cpp
 
   DEPENDS
   MLIRConversionPassIncGen
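Reviewer note: `extraClassDeclaration` is spliced verbatim into the C++ class that mlir-tblgen generates for the dialect, so each target dialect now owns the canonical spelling of its kernel attribute. A rough sketch of the resulting class shape (approximated for illustration; the exact mlir-tblgen output differs in detail):

```cpp
// Approximate shape of the generated NVVM dialect class after this change.
class NVVMDialect : public ::mlir::Dialect {
public:
  static ::llvm::StringRef getDialectNamespace() { return "nvvm"; }

  /// Get the name of the attribute used to annotate external kernel
  /// functions.
  static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; }

  // ... constructor and op/type registration elided ...
};
```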
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -11,145 +11,26 @@
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/Builders.h"
-#include "llvm/Support/FormatVariadic.h"
 
 namespace mlir {
 
-template <unsigned AllocaAddrSpace>
 struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
-  using ConvertOpToLLVMPattern<gpu::GPUFuncOp>::ConvertOpToLLVMPattern;
+  GPUFuncOpLowering(LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+                    Identifier kernelAttributeName)
+      : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
+        allocaAddrSpace(allocaAddrSpace),
+        kernelAttributeName(kernelAttributeName) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override {
-    assert(operands.empty() && "func op is not expected to have operands");
-    Location loc = gpuFuncOp.getLoc();
-
-    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
-    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
-    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
-      Value attribution = en.value();
-
-      auto type = attribution.getType().dyn_cast<MemRefType>();
-      assert(type && type.hasStaticShape() && "unexpected type in attribution");
-
-      uint64_t numElements = type.getNumElements();
-
-      auto elementType = typeConverter->convertType(type.getElementType())
-                             .template cast<Type>();
-      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
-      std::string name = std::string(
-          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
-      auto globalOp = rewriter.create<LLVM::GlobalOp>(
-          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
-          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
-          gpu::GPUDialect::getWorkgroupAddressSpace());
-      workgroupBuffers.push_back(globalOp);
-    }
-
-    // Rewrite the original GPU function to an LLVM function.
-    auto funcType = typeConverter->convertType(gpuFuncOp.getType())
-                        .template cast<LLVM::LLVMPointerType>()
-                        .getElementType();
-
-    // Remap proper input types.
-    TypeConverter::SignatureConversion signatureConversion(
-        gpuFuncOp.front().getNumArguments());
-    getTypeConverter()->convertFunctionSignature(
-        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
-
-    // Create the new function operation. Only copy those attributes that are
-    // not specific to function modeling.
-    SmallVector<NamedAttribute, 3> attributes;
-    for (const auto &attr : gpuFuncOp.getAttrs()) {
-      if (attr.first == SymbolTable::getSymbolAttrName() ||
-          attr.first == impl::getTypeAttrName() ||
-          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
-        continue;
-      attributes.push_back(attr);
-    }
-    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
-        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
-        LLVM::Linkage::External, attributes);
-
-    {
-      // Insert operations that correspond to converted workgroup and private
-      // memory attributions to the body of the function. This must operate on
-      // the original function, before the body region is inlined in the new
-      // function to maintain the relation between block arguments and the
-      // parent operation that assigns their semantics.
-      OpBuilder::InsertionGuard guard(rewriter);
-
-      // Rewrite workgroup memory attributions to addresses of global buffers.
-      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
-      unsigned numProperArguments = gpuFuncOp.getNumArguments();
-      auto i32Type = IntegerType::get(rewriter.getContext(), 32);
-
-      Value zero = nullptr;
-      if (!workgroupBuffers.empty())
-        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
-                                                 rewriter.getI32IntegerAttr(0));
-      for (auto en : llvm::enumerate(workgroupBuffers)) {
-        LLVM::GlobalOp global = en.value();
-        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
-        auto elementType =
-            global.getType().cast<LLVM::LLVMArrayType>().getElementType();
-        Value memory = rewriter.create<LLVM::GEPOp>(
-            loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
-            address, ArrayRef<Value>{zero, zero});
-
-        // Build a memref descriptor pointing to the buffer to plug with the
-        // existing memref infrastructure. This may use more registers than
-        // otherwise necessary given that memref sizes are fixed, but we can try
-        // and canonicalize that away later.
-        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
-        auto type = attribution.getType().cast<MemRefType>();
-        auto descr = MemRefDescriptor::fromStaticShape(
-            rewriter, loc, *getTypeConverter(), type, memory);
-        signatureConversion.remapInput(numProperArguments + en.index(), descr);
-      }
-
-      // Rewrite private memory attributions to alloca'ed buffers.
-      unsigned numWorkgroupAttributions =
-          gpuFuncOp.getNumWorkgroupAttributions();
-      auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
-      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
-        Value attribution = en.value();
-        auto type = attribution.getType().cast<MemRefType>();
-        assert(type && type.hasStaticShape() &&
-               "unexpected type in attribution");
-
-        // Explicitly drop memory space when lowering private memory
-        // attributions since NVVM models it as `alloca`s in the default
-        // memory space and does not support `alloca`s with addrspace(5).
-        auto ptrType = LLVM::LLVMPointerType::get(
-            typeConverter->convertType(type.getElementType())
-                .template cast<Type>(),
-            AllocaAddrSpace);
-        Value numElements = rewriter.create<LLVM::ConstantOp>(
-            gpuFuncOp.getLoc(), int64Ty,
-            rewriter.getI64IntegerAttr(type.getNumElements()));
-        Value allocated = rewriter.create<LLVM::AllocaOp>(
-            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
-        auto descr = MemRefDescriptor::fromStaticShape(
-            rewriter, loc, *getTypeConverter(), type, allocated);
-        signatureConversion.remapInput(
-            numProperArguments + numWorkgroupAttributions + en.index(), descr);
-      }
-    }
-
-    // Move the region to the new function, update the entry block signature.
-    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
-                                llvmFuncOp.end());
-    if (failed(rewriter.convertRegionTypes(
-            &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
-      return failure();
-
-    rewriter.eraseOp(gpuFuncOp);
-    return success();
-  }
+                  ConversionPatternRewriter &rewriter) const override;
+
+private:
+  /// The address space to use for `alloca`s in private memory.
+  unsigned allocaAddrSpace;
+
+  /// The attribute name to use instead of `gpu.kernel`.
+  Identifier kernelAttributeName;
 };
 
 struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
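Reviewer note: the `AllocaAddrSpace` template parameter becomes a plain constructor argument, so one compiled pattern now serves every target, and the kernel attribute name travels the same way. Passing an `Identifier` rather than a raw string also means the name is interned once, when the pattern is built, not on every rewritten function. A construction sketch that simply mirrors the NVVM registration site later in this patch (assumes an `MLIRContext context` in scope):

```cpp
#include "GPUOpsLowering.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"

// Configure the shared pattern for the NVVM target: allocas in the default
// address space, kernels tagged with the nvvm.kernel unit attribute.
mlir::LLVMTypeConverter converter(&context);
mlir::GPUFuncOpLowering pattern(
    converter, /*allocaAddrSpace=*/0,
    mlir::Identifier::get(mlir::NVVM::NVVMDialect::getKernelFuncAttrName(),
                          &context));
```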
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -0,0 +1,148 @@
+//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GPUOpsLowering.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace mlir;
+
+LogicalResult
+GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+                                   ArrayRef<Value> operands,
+                                   ConversionPatternRewriter &rewriter) const {
+  assert(operands.empty() && "func op is not expected to have operands");
+  Location loc = gpuFuncOp.getLoc();
+
+  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
+  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
+  for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+    Value attribution = en.value();
+
+    auto type = attribution.getType().dyn_cast<MemRefType>();
+    assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+    uint64_t numElements = type.getNumElements();
+
+    auto elementType = typeConverter->convertType(type.getElementType())
+                           .template cast<Type>();
+    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
+    std::string name = std::string(
+        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
+    auto globalOp = rewriter.create<LLVM::GlobalOp>(
+        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
+        LLVM::Linkage::Internal, name, /*value=*/Attribute(),
+        gpu::GPUDialect::getWorkgroupAddressSpace());
+    workgroupBuffers.push_back(globalOp);
+  }
+
+  // Rewrite the original GPU function to an LLVM function.
+  auto funcType = typeConverter->convertType(gpuFuncOp.getType())
+                      .template cast<LLVM::LLVMPointerType>()
+                      .getElementType();
+
+  // Remap proper input types.
+  TypeConverter::SignatureConversion signatureConversion(
+      gpuFuncOp.front().getNumArguments());
+  getTypeConverter()->convertFunctionSignature(
+      gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
+
+  // Create the new function operation. Only copy those attributes that are
+  // not specific to function modeling.
+  SmallVector<NamedAttribute, 3> attributes;
+  for (const auto &attr : gpuFuncOp.getAttrs()) {
+    if (attr.first == SymbolTable::getSymbolAttrName() ||
+        attr.first == impl::getTypeAttrName() ||
+        attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
+      continue;
+    attributes.push_back(attr);
+  }
+  // Add a dialect-specific kernel attribute in addition to the GPU kernel
+  // attribute. The former is necessary for further translation while the
+  // latter is expected by gpu.launch_func.
+  if (gpuFuncOp.isKernel())
+    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
+      gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
+      LLVM::Linkage::External, attributes);
+
+  {
+    // Insert operations that correspond to converted workgroup and private
+    // memory attributions to the body of the function. This must operate on
+    // the original function, before the body region is inlined in the new
+    // function to maintain the relation between block arguments and the
+    // parent operation that assigns their semantics.
+    OpBuilder::InsertionGuard guard(rewriter);
+
+    // Rewrite workgroup memory attributions to addresses of global buffers.
+    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
+    unsigned numProperArguments = gpuFuncOp.getNumArguments();
+    auto i32Type = IntegerType::get(rewriter.getContext(), 32);
+
+    Value zero = nullptr;
+    if (!workgroupBuffers.empty())
+      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
+                                               rewriter.getI32IntegerAttr(0));
+    for (auto en : llvm::enumerate(workgroupBuffers)) {
+      LLVM::GlobalOp global = en.value();
+      Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
+      auto elementType =
+          global.getType().cast<LLVM::LLVMArrayType>().getElementType();
+      Value memory = rewriter.create<LLVM::GEPOp>(
+          loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
+          address, ArrayRef<Value>{zero, zero});
+
+      // Build a memref descriptor pointing to the buffer to plug with the
+      // existing memref infrastructure. This may use more registers than
+      // otherwise necessary given that memref sizes are fixed, but we can try
+      // and canonicalize that away later.
+      Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
+      auto type = attribution.getType().cast<MemRefType>();
+      auto descr = MemRefDescriptor::fromStaticShape(
+          rewriter, loc, *getTypeConverter(), type, memory);
+      signatureConversion.remapInput(numProperArguments + en.index(), descr);
+    }
+
+    // Rewrite private memory attributions to alloca'ed buffers.
+    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
+    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
+    for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
+      Value attribution = en.value();
+      auto type = attribution.getType().cast<MemRefType>();
+      assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+      // Explicitly drop memory space when lowering private memory
+      // attributions since NVVM models it as `alloca`s in the default
+      // memory space and does not support `alloca`s with addrspace(5).
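Reviewer note: the behavioral change relative to the old header version is the `isKernel()` branch above, which leaves a kernel carrying two unit attributes. A condensed sketch of the two consumers, using helpers that exist upstream (illustrative, not added by this patch):

```cpp
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"

static void inspectLoweredFunc(mlir::LLVM::LLVMFuncOp func) {
  // gpu.launch_func and its verifier keep keying on `gpu.kernel`...
  bool launchable = mlir::gpu::GPUDialect::isKernel(func);
  // ...while the per-target translation keys on the dialect attribute.
  bool translatedAsKernel =
      static_cast<bool>(func->getAttrOfType<mlir::UnitAttr>(
          mlir::NVVM::NVVMDialect::getKernelFuncAttrName()));
  (void)launchable;
  (void)translatedAsKernel;
}
```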
+      auto ptrType = LLVM::LLVMPointerType::get(
+          typeConverter->convertType(type.getElementType())
+              .template cast<Type>(),
+          allocaAddrSpace);
+      Value numElements = rewriter.create<LLVM::ConstantOp>(
+          gpuFuncOp.getLoc(), int64Ty,
+          rewriter.getI64IntegerAttr(type.getNumElements()));
+      Value allocated = rewriter.create<LLVM::AllocaOp>(
+          gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
+      auto descr = MemRefDescriptor::fromStaticShape(
+          rewriter, loc, *getTypeConverter(), type, allocated);
+      signatureConversion.remapInput(
+          numProperArguments + numWorkgroupAttributions + en.index(), descr);
+    }
+  }
+
+  // Move the region to the new function, update the entry block signature.
+  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
+                              llvmFuncOp.end());
+  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
+                                         &signatureConversion)))
+    return failure();
+
+  rewriter.eraseOp(gpuFuncOp);
+  return success();
+}
diff --git a/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt b/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
@@ -11,6 +11,7 @@
 
   LINK_LIBS PUBLIC
   MLIRGPU
+  MLIRGPUToGPURuntimeTransforms
   MLIRLLVMIR
   MLIRNVVMIR
   MLIRPass
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -167,11 +167,16 @@
                                   NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
       GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                   NVVM::GridDimYOp, NVVM::GridDimZOp>,
-      GPUShuffleOpLowering, GPUReturnOpLowering,
-      // Explicitly drop memory space when lowering private memory
-      // attributions since NVVM models it as `alloca`s in the default
-      // memory space and does not support `alloca`s with addrspace(5).
-      GPUFuncOpLowering<0>>(converter);
+      GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
+
+  // Explicitly drop memory space when lowering private memory
+  // attributions since NVVM models it as `alloca`s in the default
+  // memory space and does not support `alloca`s with addrspace(5).
+  patterns.insert<GPUFuncOpLowering>(
+      converter, /*allocaAddrSpace=*/0,
+      Identifier::get(NVVM::NVVMDialect::getKernelFuncAttrName(),
+                      &converter.getContext()));
+
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                 "__nv_fabs");
   patterns.insert<OpToFuncCallLowering<AtanOp>>(converter, "__nv_atanf",
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -11,6 +11,7 @@
 
   LINK_LIBS PUBLIC
   MLIRGPU
+  MLIRGPUToGPURuntimeTransforms
   MLIRLLVMIR
   MLIRROCDLIR
   MLIRPass
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -103,7 +103,11 @@
                                   ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
       GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                   ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
-      GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);
+      GPUReturnOpLowering>(converter);
+  patterns.insert<GPUFuncOpLowering>(
+      converter, /*allocaAddrSpace=*/5,
+      Identifier::get(ROCDL::ROCDLDialect::getKernelFuncAttrName(),
+                      &converter.getContext()));
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
                                                 "__ocml_fabs_f64");
   patterns.insert<OpToFuncCallLowering<CeilFOp>>(
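Reviewer note: the two registrations above differ only in their constructor arguments, which is the point of dropping the template parameter. Any out-of-tree target could reuse the pattern the same way; a sketch with an invented `mygpu.kernel` attribute name (hypothetical, not part of this patch):

```cpp
#include "GPUOpsLowering.h"

// Hypothetical third-party target plugging its own kernel attribute and
// alloca address space into the shared pattern.
void populateMyGPUFuncLoweringPattern(
    mlir::LLVMTypeConverter &converter,
    mlir::OwningRewritePatternList &patterns) {
  patterns.insert<mlir::GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/0,
      mlir::Identifier::get("mygpu.kernel", &converter.getContext()));
}
```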
diff --git a/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
--- a/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
@@ -77,7 +77,8 @@
   // function as a kernel.
   for (auto func :
        ModuleTranslation::getModuleBody(m).getOps<LLVM::LLVMFuncOp>()) {
-    if (!gpu::GPUDialect::isKernel(func))
+    if (!func->getAttrOfType<UnitAttr>(
+            NVVM::NVVMDialect::getKernelFuncAttrName()))
       continue;
 
     auto *llvmFunc = llvmModule->getFunction(func.getName());
diff --git a/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp
--- a/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp
@@ -87,7 +87,7 @@
   for (auto func :
        ModuleTranslation::getModuleBody(m).getOps<LLVM::LLVMFuncOp>()) {
     if (!func->getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelFuncAttrName()))
+            ROCDL::ROCDLDialect::getKernelFuncAttrName()))
       continue;
 
     auto *llvmFunc = llvmModule->getFunction(func.getName());
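Reviewer note: for context, the NVVM translation loop above goes on (in lines outside this hunk) to emit the `nvvm.annotations` metadata that makes the NVPTX backend treat the function as a kernel; condensed, the per-function emission looks roughly like this (`llvmFunc` and `llvmModule` as in the hunk):

```cpp
// Mark the function as a kernel in the module-level nvvm.annotations
// metadata, e.g. !{ptr @kernel_func, !"kernel", i32 1}.
llvm::Metadata *llvmMetadata[] = {
    llvm::ValueAsMetadata::get(llvmFunc),
    llvm::MDString::get(llvmModule->getContext(), "kernel"),
    llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
        llvm::Type::getInt32Ty(llvmModule->getContext()), 1))};
llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
    ->addOperand(llvm::MDNode::get(llvmModule->getContext(), llvmMetadata));
```

The ROCDL path instead switches the function to the `amdgpu_kernel` calling convention, which is what the `rocdl.mlir` test below checks.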
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -423,3 +423,15 @@
     std.return %result32, %result64 : f32, f64
   }
 }
+
+// -----
+
+gpu.module @test_module {
+  // CHECK-LABEL: @kernel_func
+  // CHECK: attributes
+  // CHECK: gpu.kernel
+  // CHECK: nvvm.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -340,3 +340,15 @@
     std.return %result32, %result64 : f32, f64
   }
 }
+
+// -----
+
+gpu.module @test_module {
+  // CHECK-LABEL: @kernel_func
+  // CHECK: attributes
+  // CHECK: gpu.kernel
+  // CHECK: rocdl.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
diff --git a/mlir/test/Target/nvvmir.mlir b/mlir/test/Target/nvvmir.mlir
--- a/mlir/test/Target/nvvmir.mlir
+++ b/mlir/test/Target/nvvmir.mlir
@@ -75,7 +75,7 @@
 // This function has the "kernel" attribute attached and should appear in the
 // NVVM annotations after conversion.
-llvm.func @kernel_func() attributes {gpu.kernel} {
+llvm.func @kernel_func() attributes {nvvm.kernel} {
   llvm.return
 }
diff --git a/mlir/test/Target/rocdl.mlir b/mlir/test/Target/rocdl.mlir
--- a/mlir/test/Target/rocdl.mlir
+++ b/mlir/test/Target/rocdl.mlir
@@ -29,7 +29,7 @@
   llvm.return %1 : i32
 }
 
-llvm.func @kernel_func() attributes {gpu.kernel} {
+llvm.func @kernel_func() attributes {rocdl.kernel} {
   // CHECK-LABEL: amdgpu_kernel void @kernel_func
   llvm.return
 }