diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -25,7 +25,8 @@
 namespace mlir {
 /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
 /// a separate kernel function.
-std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass();
+std::unique_ptr<OperationPass<ModuleOp>>
+createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());
 
 /// Rewrites a function region so that GPU ops execute asynchronously.
 std::unique_ptr<OperationPass<FuncOp>> createGpuAsyncRegionPass();
diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td
--- a/mlir/include/mlir/Dialect/GPU/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Passes.td
@@ -14,6 +14,7 @@
 def GpuKernelOutlining : Pass<"gpu-kernel-outlining", "ModuleOp"> {
   let summary = "Outline gpu.launch bodies to kernel functions";
   let constructor = "mlir::createGpuKernelOutliningPass()";
+  let dependentDialects = ["mlir::DLTIDialect"];
 }
 
 def GpuAsyncRegionPass : FunctionPass<"gpu-async-region"> {
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -12,6 +12,7 @@
 
 #include "PassDetail.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/GPU/Utils.h"
@@ -20,6 +21,7 @@
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/SymbolTable.h"
+#include "mlir/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/RegionUtils.h"
 
@@ -239,6 +241,31 @@
 class GpuKernelOutliningPass
     : public GpuKernelOutliningBase<GpuKernelOutliningPass> {
 public:
+  GpuKernelOutliningPass(StringRef dlStr) {
+    if (!dlStr.empty() && !dataLayoutStr.hasValue())
+      dataLayoutStr = dlStr.str();
+  }
+
+  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
+      : dataLayoutSpec(other.dataLayoutSpec) {
+    dataLayoutStr = other.dataLayoutStr;
+  }
+
+  LogicalResult initialize(MLIRContext *context) override {
+    // Initialize the data layout specification from the data layout string.
+    if (!dataLayoutStr.empty()) {
+      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
+      if (!resultAttr)
+        return failure();
+
+      dataLayoutSpec = resultAttr.dyn_cast<DataLayoutSpecInterface>();
+      if (!dataLayoutSpec)
+        return failure();
+    }
+
+    return success();
+  }
+
   void runOnOperation() override {
     SymbolTable symbolTable(getOperation());
     bool modified = false;
@@ -290,6 +317,12 @@
     OpBuilder builder(context);
     auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                          kernelFunc.getName());
+
+    // If a valid data layout spec was provided, attach it to the kernel module.
+    // Otherwise, the default data layout will be used.
+    if (dataLayoutSpec)
+      kernelModule->setAttr("dlspec", dataLayoutSpec);
+
     SymbolTable symbolTable(kernelModule);
     symbolTable.insert(kernelFunc);
 
@@ -313,10 +346,18 @@
 
     return kernelModule;
   }
+
+  Option<std::string> dataLayoutStr{
+      *this, "data-layout-str",
+      llvm::cl::desc("String containing the data layout specification to be "
+                     "attached to the GPU kernel module")};
+
+  DataLayoutSpecInterface dataLayoutSpec;
 };
 
 } // namespace
 
-std::unique_ptr<OperationPass<ModuleOp>> mlir::createGpuKernelOutliningPass() {
-  return std::make_unique<GpuKernelOutliningPass>();
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
+  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
--- a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
+++ b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
@@ -10,6 +10,7 @@
 #define DIALECT_GPU_TRANSFORMS_PASSDETAIL_H_
 
 #include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining=data-layout-str='#dlti.dl_spec<#dlti.dl_entry<index,32:i32>>' -split-input-file %s | FileCheck --check-prefix CHECK-DL %s
 
 // CHECK: module attributes {gpu.container_module}
 
@@ -35,8 +36,9 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
 
-// CHECK-LABEL: module @launch_kernel
+// CHECK-LABEL: gpu.module @launch_kernel
 // CHECK-NEXT: gpu.func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
 // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
@@ -81,7 +83,10 @@
   return
 }
 
-// CHECK: module @multiple_launches_kernel
+// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK: gpu.module @multiple_launches_kernel
 // CHECK: func @multiple_launches_kernel
 // CHECK: module @multiple_launches_kernel_0
 // CHECK: func @multiple_launches_kernel
@@ -106,6 +111,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
 // CHECK: arith.constant 2
 
@@ -130,6 +137,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_kernel(
 // CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
 // CHECK: arith.constant 2
@@ -158,6 +167,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_noarg_kernel(
 // CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
 // CHECK: %[[KCST:.*]] = arith.constant 2
@@ -186,6 +197,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // -----
 
 // CHECK-LABEL: @multiple_uses2
@@ -213,6 +226,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // -----
 
 llvm.mlir.global internal @global(42 : i64) : i64
@@ -242,6 +257,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK: gpu.module @function_call_kernel {
 // CHECK:   gpu.func @function_call_kernel()
 // CHECK:     call @device_function() : () -> ()
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3000,6 +3000,7 @@
     deps = [
         ":ArithmeticDialect",
         ":Async",
+        ":DLTIDialect",
         ":GPUDialect",
         ":GPUPassIncGen",
         ":MemRefDialect",