diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -33,9 +33,6 @@
     /// functions.
     static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }

-    /// Get the name of the attribute used to annotate kernel modules.
-    static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
-
     /// Returns whether the given function is a kernel function, i.e., has the
     /// 'gpu.kernel' attribute.
     static bool isKernel(Operation *op);
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -85,7 +85,8 @@
   }];
 }

-def GPU_GPUFuncOp : GPU_Op<"func", [AutomaticAllocationScope, FunctionLike,
+def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">,
+                                    AutomaticAllocationScope, FunctionLike,
                                     IsolatedFromAbove, Symbol]> {
   let summary = "Function executable on a GPU";

diff --git a/mlir/test/Dialect/GPU/all-reduce-max.mlir b/mlir/test/Dialect/GPU/all-reduce-max.mlir
--- a/mlir/test/Dialect/GPU/all-reduce-max.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-max.mlir
@@ -1,8 +1,8 @@
 // RUN: mlir-opt -test-all-reduce-lowering %s | FileCheck %s

 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py

-// CHECK: module @kernels attributes {gpu.kernel_module} {
-module @kernels attributes {gpu.kernel_module} {
+// CHECK: gpu.module @kernels {
+gpu.module @kernels {
 // CHECK-LABEL: gpu.func @kernel(
 // CHECK-SAME: [[VAL_0:%.*]]: f32) workgroup([[VAL_1:%.*]] : memref<32xf32, 3>) kernel {
diff --git a/mlir/test/Dialect/GPU/all-reduce.mlir b/mlir/test/Dialect/GPU/all-reduce.mlir
--- a/mlir/test/Dialect/GPU/all-reduce.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce.mlir
@@ -1,8 +1,8 @@
 // RUN: mlir-opt -test-all-reduce-lowering %s | FileCheck %s

 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py

-// CHECK: module @kernels attributes {gpu.kernel_module} {
-module @kernels attributes {gpu.kernel_module} {
+// CHECK: gpu.module @kernels {
+gpu.module @kernels {
 // CHECK-LABEL: gpu.func @kernel(
 // CHECK-SAME: [[VAL_0:%.*]]: f32) workgroup([[VAL_1:%.*]] : memref<32xf32, 3>) kernel {
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -109,6 +109,17 @@

 // -----

+module attributes {gpu.container_module} {
+  module @kernels {
+    // expected-error@+1 {{'gpu.func' op expects parent op 'gpu.module'}}
+    gpu.func @kernel_1(%arg1 : !llvm<"float*">) {
+      gpu.return
+    }
+  }
+}
+
+// -----
+
 module attributes {gpu.container_module} {
   module @kernels {
   }
@@ -341,7 +352,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{requires 'type' attribute of function type}}
     "gpu.func"() ({
       gpu.return
@@ -352,7 +363,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memref type in attribution}}
     gpu.func @kernel() workgroup(%0: i32) {
       gpu.return
@@ -363,7 +374,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memory space 3 in attribution}}
     gpu.func @kernel() workgroup(%0: memref<4xf32>) {
       gpu.return
@@ -374,7 +385,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memory space 5 in attribution}}
     gpu.func @kernel() private(%0: memref<4xf32>) {
       gpu.return
@@ -385,7 +396,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memory space 5 in attribution}}
     gpu.func @kernel() private(%0: memref<4xf32>) {
       gpu.return
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -83,7 +83,7 @@
     return
   }

-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
     // CHECK: workgroup
     // CHECK: private
diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir
--- a/mlir/test/Dialect/GPU/promotion.mlir
+++ b/mlir/test/Dialect/GPU/promotion.mlir
@@ -1,119 +1,128 @@
 // RUN: mlir-opt -allow-unregistered-dialect -test-gpu-memory-promotion -split-input-file %s | FileCheck %s

-module @foo attributes {gpu.kernel_module} {
-  // Verify that the attribution was indeed introduced
-  // CHECK-LABEL: @memref3d
-  // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
-  // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
-  gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
-    // Verify that loop bounds are emitted, the order does not matter.
-    // CHECK-DAG: %[[c1:.*]] = constant 1
-    // CHECK-DAG: %[[c4:.*]] = constant 4
-    // CHECK-DAG: %[[c5:.*]] = constant 5
-    // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
-    // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
-    // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
-    // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
-    // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
-    // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
-
-    // Verify that loops for the copy are emitted. We only check the number of
-    // loops here since their bounds are produced by mapLoopToProcessorIds,
-    // tested separately.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-
-    // Verify that the copy is emitted and uses only the last two loops.
-    // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
-    // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
-
-    // Verify that the use has been rewritten.
-    // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
-    "use"(%arg0) : (memref<5x4xf32>) -> ()
-
-
-    // Verify that loops for the copy are emitted. We only check the number of
-    // loops here since their bounds are produced by mapLoopToProcessorIds,
-    // tested separately.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-
-    // Verify that the copy is emitted and uses only the last two loops.
-    // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
-    // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
-    gpu.return
+module {
+  gpu.module @foo {
+
+    // Verify that the attribution was indeed introduced
+    // CHECK-LABEL: @memref3d
+    // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
+    // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
+    gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
+      // Verify that loop bounds are emitted, the order does not matter.
+      // CHECK-DAG: %[[c1:.*]] = constant 1
+      // CHECK-DAG: %[[c4:.*]] = constant 4
+      // CHECK-DAG: %[[c5:.*]] = constant 5
+      // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+      // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+      // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+      // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+      // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+      // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+      // Verify that loops for the copy are emitted. We only check the number of
+      // loops here since their bounds are produced by mapLoopToProcessorIds,
+      // tested separately.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+
+      // Verify that the copy is emitted and uses only the last two loops.
+      // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
+      // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
+
+      // Verify that the use has been rewritten.
+      // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
+      "use"(%arg0) : (memref<5x4xf32>) -> ()
+
+
+      // Verify that loops for the copy are emitted. We only check the number of
+      // loops here since their bounds are produced by mapLoopToProcessorIds,
+      // tested separately.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+
+      // Verify that the copy is emitted and uses only the last two loops.
+      // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
+      // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
+      gpu.return
+    }
   }
 }

 // -----

-module @foo attributes {gpu.kernel_module} {
-  // Verify that the attribution was indeed introduced
-  // CHECK-LABEL: @memref5d
-  // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
-  // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
-  gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
-    // Verify that loop bounds are emitted, the order does not matter.
-    // CHECK-DAG: %[[c0:.*]] = constant 0
-    // CHECK-DAG: %[[c1:.*]] = constant 1
-    // CHECK-DAG: %[[c4:.*]] = constant 4
-    // CHECK-DAG: %[[c5:.*]] = constant 5
-    // CHECK-DAG: %[[c6:.*]] = constant 6
-    // CHECK-DAG: %[[c7:.*]] = constant 7
-    // CHECK-DAG: %[[c8:.*]] = constant 8
-    // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
-    // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
-    // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
-    // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
-    // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
-    // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
-
-    // Verify that loops for the copy are emitted.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-    // CHECK: loop.for %[[i3:.*]] =
-    // CHECK: loop.for %[[i4:.*]] =
-
-    // Verify that the copy is emitted.
-    // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-    // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-
-    // Verify that the use has been rewritten.
-    // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
-    "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
-
-    // Verify that loop loops for the copy are emitted.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-    // CHECK: loop.for %[[i3:.*]] =
-    // CHECK: loop.for %[[i4:.*]] =
-
-    // Verify that the copy is emitted.
-    // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-    // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-    gpu.return
+module {
+  gpu.module @foo {
+
+    // Verify that the attribution was indeed introduced
+    // CHECK-LABEL: @memref5d
+    // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
+    // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
+    gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
+      // Verify that loop bounds are emitted, the order does not matter.
+      // CHECK-DAG: %[[c0:.*]] = constant 0
+      // CHECK-DAG: %[[c1:.*]] = constant 1
+      // CHECK-DAG: %[[c4:.*]] = constant 4
+      // CHECK-DAG: %[[c5:.*]] = constant 5
+      // CHECK-DAG: %[[c6:.*]] = constant 6
+      // CHECK-DAG: %[[c7:.*]] = constant 7
+      // CHECK-DAG: %[[c8:.*]] = constant 8
+      // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+      // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+      // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+      // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+      // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+      // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+      // Verify that loops for the copy are emitted.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+      // CHECK: loop.for %[[i3:.*]] =
+      // CHECK: loop.for %[[i4:.*]] =
+
+      // Verify that the copy is emitted.
+      // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+      // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+      // Verify that the use has been rewritten.
+      // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+      "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+      // Verify that loops for the copy are emitted.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+      // CHECK: loop.for %[[i3:.*]] =
+      // CHECK: loop.for %[[i4:.*]] =
+
+      // Verify that the copy is emitted.
+      // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+      // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+      gpu.return
+    }
   }
 }

 // -----

-module @foo attributes {gpu.kernel_module} {
-  // Check that attribution insertion works fine.
-  // CHECK-LABEL: @insert
-  // CHECK-SAME: (%{{.*}}: memref<4xf32>
-  // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
-  // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
-  // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
-  gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
-      workgroup(%arg1: memref<1x1xf64, 3>)
-      private(%arg2: memref<1x1xi64, 5>)
-      kernel {
-    // CHECK: "use"(%[[wg2]])
-    "use"(%arg0) : (memref<4xf32>) -> ()
-    gpu.return
+module {
+  gpu.module @foo {
+
+    // Check that attribution insertion works fine.
+    // CHECK-LABEL: @insert
+    // CHECK-SAME: (%{{.*}}: memref<4xf32>
+    // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+    // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+    // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+    gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+        workgroup(%arg1: memref<1x1xf64, 3>)
+        private(%arg2: memref<1x1xi64, 5>)
+        kernel {
+      // CHECK: "use"(%[[wg2]])
+      "use"(%arg0) : (memref<4xf32>) -> ()
+      gpu.return
+    }
   }
 }
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
--- a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -25,13 +25,18 @@
 /// or beneficial (e.g., makes previously uncoalesced loads coalesced).
 class TestGpuMemoryPromotionPass
     : public PassWrapper<TestGpuMemoryPromotionPass,
-                         OperationPass<gpu::GPUFuncOp>> {
+                         OperationPass<gpu::GPUModuleOp>> {
   void runOnOperation() override {
-    gpu::GPUFuncOp op = getOperation();
-    for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
-      if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
-        promoteToWorkgroupMemory(op, i);
-    }
+    gpu::GPUModuleOp gpuModule = getOperation();
+    gpuModule.walk([](Operation *operation) {
+      gpu::GPUFuncOp op = llvm::dyn_cast<gpu::GPUFuncOp>(operation);
+      if (op) {
+        for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
+          if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
+            promoteToWorkgroupMemory(op, i);
+        }
+      }
+    });
   }
 };
 } // end namespace
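
Illustration (not part of the patch): a minimal sketch of the nesting this change enforces. The kernel name and body are hypothetical; what the patch requires is only that gpu.func be nested in the first-class gpu.module op rather than in a builtin module carrying the now-removed gpu.kernel_module attribute, which the new HasParent<"GPUModuleOp"> trait and the "expects parent op 'gpu.module'" verifier error above check for.

  // Previously: a builtin module tagged with the gpu.kernel_module attribute.
  module @kernels attributes {gpu.kernel_module} {
    gpu.func @kernel(%arg0: f32) kernel {
      gpu.return
    }
  }

  // Now: a dedicated gpu.module op inside a gpu.container_module.
  module attributes {gpu.container_module} {
    gpu.module @kernels {
      gpu.func @kernel(%arg0: f32) kernel {
        gpu.return
      }
    }
  }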