diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_GPU_PASSES_H_
 #define MLIR_DIALECT_GPU_PASSES_H_
 
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Pass/Pass.h"
 
@@ -25,7 +26,8 @@
 namespace mlir {
 /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
 /// a separate kernel function.
-std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass();
+std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass(
+    DataLayoutSpecInterface dataLayoutSpec = DataLayoutSpecAttr());
 
 /// Rewrites a function region so that GPU ops execute asynchronously.
 std::unique_ptr<OperationPass<FuncOp>> createGpuAsyncRegionPass();
diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td
--- a/mlir/include/mlir/Dialect/GPU/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Passes.td
@@ -14,6 +14,7 @@
 def GpuKernelOutlining : Pass<"gpu-kernel-outlining", "ModuleOp"> {
   let summary = "Outline gpu.launch bodies to kernel functions";
   let constructor = "mlir::createGpuKernelOutliningPass()";
+  let dependentDialects = ["mlir::DLTIDialect"];
 }
 
 def GpuAsyncRegionPass : FunctionPass<"gpu-async-region"> {
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -12,6 +12,7 @@
 
 #include "PassDetail.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/GPU/Utils.h"
@@ -227,6 +228,17 @@
 }
 
 namespace {
+
+/// Returns a data layout spec with a single entry that fixes the size of the
+/// `index` type at 32 bits (encoded as the signless attribute `32 : i32`).
+/// Declared `static` since LLVM style prefers that for file-local functions.
+static DataLayoutSpecAttr create32bitIndexTypeDataLayout(MLIRContext *context) {
+  Builder builder(context);
+  DataLayoutEntryAttr indexEntry = DataLayoutEntryAttr::get(
+      builder.getIndexType(), builder.getI32IntegerAttr(/*value=*/32));
+  return DataLayoutSpecAttr::get(context, {indexEntry});
+}
+
 /// Pass that moves the kernel of each LaunchOp into its separate nested module.
 ///
 /// This pass moves the kernel code of each LaunchOp into a function created
@@ -239,7 +251,26 @@
 class GpuKernelOutliningPass
     : public GpuKernelOutliningBase<GpuKernelOutliningPass> {
 public:
+  /// Creates the pass with the data layout spec to attach to each outlined
+  /// kernel module. A null `dlSpec` means the pass has no spec of its own.
+  explicit GpuKernelOutliningPass(DataLayoutSpecInterface dlSpec)
+      : dataLayoutSpec(dlSpec) {}
+
+  /// Copy constructor. Forwards `other` to the base-class copy constructor
+  /// instead of silently default-constructing the base, then carries over the
+  /// layout spec and the value of the test option.
+  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
+      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
+    test32bitIndexTypeDL = other.test32bitIndexTypeDL;
+  }
+
   void runOnOperation() override {
+    // If the test flag was provided, overwrites the data layout specification
+    // with a new one that sets the Index type size to 32-bit.
+    if (test32bitIndexTypeDL)
+      dataLayoutSpec =
+          create32bitIndexTypeDataLayout(getOperation().getContext());
+
     SymbolTable symbolTable(getOperation());
     bool modified = false;
     for (auto func : getOperation().getOps<FuncOp>()) {
@@ -278,6 +304,14 @@
                               UnitAttr::get(&getContext()));
   }
 
+  Option<bool> test32bitIndexTypeDL{
+      *this, "test-32bit-index-type-dl",
+      llvm::cl::desc(
+          "Create a kernel module and attach a data layout specification with "
+          "index type set to 32 bits. This flag is only intended to be used "
+          "for testing purposes"),
+      llvm::cl::init(false)};
+
 private:
   /// Returns a gpu.module containing kernelFunc and all callees (recursive).
   gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
@@ -290,6 +324,12 @@
     OpBuilder builder(context);
     auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                          kernelFunc.getName());
+
+    // Attach the provided spec, if any; otherwise the default layout is used.
+    // NOTE(review): DLTI's standard name is `dlti.dl_spec` - confirm "dlspec".
+    if (dataLayoutSpec)
+      kernelModule->setAttr("dlspec", dataLayoutSpec);
+
     SymbolTable symbolTable(kernelModule);
     symbolTable.insert(kernelFunc);
 
@@ -313,10 +353,13 @@
 
     return kernelModule;
   }
+
+  DataLayoutSpecInterface dataLayoutSpec;
 };
 
 } // namespace
 
-std::unique_ptr<OperationPass<ModuleOp>> mlir::createGpuKernelOutliningPass() {
-  return std::make_unique<GpuKernelOutliningPass>();
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createGpuKernelOutliningPass(DataLayoutSpecInterface dataLayoutSpec) {
+  return std::make_unique<GpuKernelOutliningPass>(dataLayoutSpec);
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
--- a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
+++ b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
@@ -10,6 +10,7 @@
 #define DIALECT_GPU_TRANSFORMS_PASSDETAIL_H_
 
 #include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining=test-32bit-index-type-dl -split-input-file %s | FileCheck --check-prefix CHECK-DL %s
 
 // CHECK: module attributes {gpu.container_module}
 
@@ -35,8 +36,9 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
 
-// CHECK-LABEL: module @launch_kernel
+// CHECK-LABEL: gpu.module @launch_kernel
 // CHECK-NEXT: gpu.func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
 // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
@@ -81,7 +83,10 @@
   return
 }
 
-// CHECK: module @multiple_launches_kernel
+// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK: gpu.module @multiple_launches_kernel
 // CHECK: func @multiple_launches_kernel
-// CHECK: module @multiple_launches_kernel_0
+// CHECK: gpu.module @multiple_launches_kernel_0
 // CHECK: func @multiple_launches_kernel
@@ -106,6 +111,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
 // CHECK: arith.constant 2
 
@@ -130,6 +137,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_kernel(
 // CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
 // CHECK: arith.constant 2
@@ -158,6 +167,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_noarg_kernel(
 // CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
 // CHECK: %[[KCST:.*]] = arith.constant 2
@@ -186,6 +197,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // -----
 
 // CHECK-LABEL: @multiple_uses2
@@ -213,6 +226,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // -----
 
 llvm.mlir.global internal @global(42 : i64) : i64
@@ -242,6 +257,8 @@
   return
 }
 
+// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK: gpu.module @function_call_kernel {
 // CHECK:   gpu.func @function_call_kernel()
 // CHECK:     call @device_function() : () -> ()
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3000,6 +3000,7 @@
     deps = [
         ":ArithmeticDialect",
         ":Async",
+        ":DLTIDialect",
         ":GPUDialect",
         ":GPUPassIncGen",
         ":MemRefDialect",