diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(TransformOps) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS GPUTransformOps.td) +mlir_tablegen(GPUTransformOps.h.inc -gen-op-decls) +mlir_tablegen(GPUTransformOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRGPUTransformOpsIncGen) + +add_mlir_doc(GPUTransformOps GPUTransformOps Dialects/ -gen-op-doc) diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h @@ -0,0 +1,75 @@ +//===- GPUTransformOps.h - GPU transform ops --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H +#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H + +#include "mlir/Dialect/PDL/IR/PDLTypes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +namespace gpu { +class GpuOp; +} // namespace gpu +} // namespace mlir + +//===----------------------------------------------------------------------===// +// GPU Transform Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc" + +namespace mlir { +class DialectRegistry; +namespace transform { +namespace gpu { + +/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// op to GPU threads. Mapping is one-to-one and the induction variables of +/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the +/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in +/// which case, the union of the number of threads is computed and may result in +/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not +/// supported. Dynamic block dim sizes are currently not supported. +DiagnosedSilenceableFailure +mapNestedForeachToThreadsImp(RewriterBase &rewriter, Operation *target, + const SmallVectorImpl &blockDim, + bool syncAfterDistribute, + llvm::Optional transformOp); + +/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is +/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten +/// to gpu.block_id according to the thread_dim_mapping attribute. Dynamic, +/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block +/// dim sizes are currently not supported.
+DiagnosedSilenceableFailure mapForeachToBlocksImp( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + function_ref &)> + blockIdGenerator, + SmallVectorImpl &gridDims, TransformOpInterface transformOp); + +/// Finds the top level scf::ForeachThreadOp of given target. +DiagnosedSilenceableFailure +findTopLevelForeachThreadOp(Operation *target, + scf::ForeachThreadOp &topLevelForeachThreadOp, + TransformOpInterface transformOp); + +} // namespace gpu +} // namespace transform + +namespace gpu { +void registerTransformDialectExtension(DialectRegistry ®istry); +} // namespace gpu +} // namespace mlir + +#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td @@ -0,0 +1,175 @@ +//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_TRANSFORM_OPS +#define GPU_TRANSFORM_OPS + +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformEffects.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/Dialect/PDL/IR/PDLTypes.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" + +def MapNestedForeachToThreads : + Op { + let description = [{ + Target the `gpu.launch op` and rewrite all `scf.foreach_thread` + nested in it to distributed `gpu.thread_id` attribute. + + The operation searches for `scf.foreach_thread` ops nested under `target` + and maps each such op to GPU threads. Mapping is one-to-one and the + induction variables of `scf.foreach_thread` are rewritten to + `gpu.thread_id` according to the `thread_dim_mapping` attribute. + + Sibling `scf.foreach_thread` are supported in which case, the union of + the number of threads is computed and may result in predication. + + Multiple scf.foreach_thread are supported per `gpu.launch` in which case, + the max of all the threads is computed and taken for the global + `gpu.thread_id`. If necessary, `scf.foreach_thread` that do not use the + whole thread range result in predicated computations. + + Dynamic `scf.foreach_thread` trip counts are currently not supported. + Dynamic block dim sizes are currently not supported. + + Only **bufferized** `scf.foreach_thread` are currently supported. + Only `scf.foreach_thread` distributed to **at most 3 dimensions** are + currently supported. + + Barriers are inserted after each scf.foreach_thread op for now. + + The operation alters the block size of the given gpu_launch using + blockDim argument. + + #### Return modes: + + This operation ignores non-gpu_launch ops and drops them in the return. + + If any scf.foreach_thread with tensors is found, the transform definitely + fails. + + If all the scf.foreach_thread operations contained within the LaunchOp + referred to by the `target` PDLOperation lower to GPU properly, the + transform succeeds. Otherwise the transform definitely fails. 
+ + The returned handle points to the same LaunchOp operand, consuming it and + producing a new SSA value to satisfy chaining and linearity of the IR + properties. + + #### Example: + + ``` + gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) + threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { + scf.foreach_thread (%i, %j) in (7, 9) { + ... // body 1 + } {thread_dim_mapping = [1, 0, 2]} + scf.foreach_thread (%i) in (12) { + ... // body 2 + } + gpu.terminator + } + ``` + is translated to: + + ``` + %bdimX = arith.constant 12 : index + %bdimY = arith.constant 9 : index + gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) + threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) { + if (threadIdx.x < 9 && threadIdx.y < 7) { + ... // body 1 + } + gpu.barrier + if (threadIdx.y < 1) { + ... // body 2 + } + gpu.barrier + gpu.terminator + } + ``` + }]; + + let arguments = (ins PDL_Operation:$target, + DefaultValuedAttr:$blockDim, + DefaultValuedAttr:$syncAfterDistribute); + let results = (outs PDL_Operation:$result); + + let assemblyFormat = "$target attr-dict"; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state); + }]; +} + + +def MapForeachToBlocks : + Op { + let description = [{ + Target the gpu_launch op and rewrite the top level `scf.foreach_thread` + to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute + is set, then first generates `gpu_launch` and moves the top level + `scf.foreach_thread` inside. + + The operation searches top level `scf.foreach_thread` ops under + `gpu_launch` and maps each such op to GPU blocks. Mapping is + one-to-one and the induction variables of `scf.foreach_thread` are + rewritten to gpu.block_id according to the `thread_dim_mapping` attribute. + + Dynamic, `scf.foreach_thread` trip counts are currently not supported. + Dynamic block dim sizes are currently not supported. + + Only **bufferized** scf.foreach_thread are currently supported. + Only scf.foreach_thread distributed to **at most 3 dimensions** are + currently supported. + + The operation alters the grid size of the given gpu_launch using the + gridDim argument. + + #### Return modes: + + This operation ignores non-gpu_launch ops and drops them in the return. + + If any scf.foreach_thread with tensors is found, the transform definitely + fails. + + If all the scf.foreach_thread operations contained within the LaunchOp + referred to by the `target` PDLOperation lower to GPU properly, the + transform succeeds. Otherwise the transform definitely fails. + + The returned handle points to the same LaunchOp operand, consuming it and + producing a new SSA value to satisfy chaining and linearity of the IR + properties.
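+ + #### Example: + + A usage sketch adapted from the transform-gpu.mlir test updated in this patch; the `%funcop` handle and the enclosing transform sequence are illustrative context, and the second op only demonstrates chaining on the returned handle: + + ``` + %funcop = transform.structured.match ops{["func.func"]} in %arg0 + %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch } + transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] } + ```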
+ }]; + + let arguments = (ins PDL_Operation:$target, + DefaultValuedAttr:$gridDim, + UnitAttr:$generate_gpu_launch); + let results = (outs PDL_Operation:$result); + + let assemblyFormat = "$target attr-dict"; + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state); + }]; +} + +#endif // GPU_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -751,161 +751,6 @@ }]; } -def MapNestedForeachThreadToGpuThreads : - Op { - let description = [{ - Target the gpu_launch op and rewrite all scf.foreach_thread - to distributed gpu.thread_id attribute. - - The operation searches `scf.foreach_thread` ops nested under `target` - and maps each such op to GPU threads. Mapping is one-to-one and the - induction variables of `scf.foreach_thread` are rewritten to - gpu.thread_id according to the thread_dim_apping attribute. - - Sibling `scf.foreach_thread` are supported in which case, the union of - the number of threads is computed and may result in predication. - - Multiple scf.foreach_thread are supported per function in which case, the - max of all the threads is computed and taken for the global gpu.thread_id. - If necessary, scf.foreach_thread that do not use the whole thread range - result in predicated computations. - - Dynamic, `scf.foreach_thread` trip counts are currently not supported. - Dynamic block dim sizes are currently not supported. - - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are - currently supported. - - Barriers are inserted after each scf.foreach_thread op for now. - - The operation alters the block size of the given gpu_launch using - blockDim argument. - - #### Return modes: - - This operation ignores non-gpu_launch ops and drops them in the return. - - If any scf.foreach_thread with tensors is found, the transform definitely - fails. - - If all the scf.foreach_thread operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the - transform succeeds. Otherwise the transform definitely fails. - - The returned handle points to the same LaunchOp operand, consuming it and - producing a new SSA value to satisfy chaining and linearity of the IR - properties. - - #### Example: - - ``` - gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) - threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { - scf.foreach_thread (%i, %j) in (7, 9) { - ... // body 1 - } {thread_dim_mapping = [1, 0, 2]} - scf.foreach_thread (%i) in (12) { - ... // body 2 - } - gpu.terminator - } - ``` - is translated to: - - ``` - %bdimX = arith.constant 12 : index - %bdimY = arith.constant 9 : index - gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) - threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) { - if (threadIdx.x < 9 && threadIdx.y < 7) { - ... // body 1 - } - gpu.barrier - if (threadIdx.y < 1) { - ... 
// body 2 - } - gpu.barrier - gpu.terminator - } - ``` - }]; - - let arguments = (ins PDL_Operation:$target, - DefaultValuedAttr:$blockDim, - DefaultValuedAttr:$syncAfterDistribute); - let results = (outs PDL_Operation:$result); - - let assemblyFormat = "$target attr-dict"; - let extraClassDeclaration = [{ - ::mlir::DiagnosedSilenceableFailure applyToOne( - ::mlir::Operation *target, - ::llvm::SmallVectorImpl<::mlir::Operation *> &results, - ::mlir::transform::TransformState &state); - }]; -} - -def MapNestedForeachThreadToGpuBlocks : Op { - let description = [{ - Target the gpu_launch op and rewrite the top level `scf.foreach_thread` - to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute - is set, then first generates `gpu_launch` and moves the top level - `scf.foreach_thread` inside. - - The operation searches top level `scf.foreach_thread` ops under - `gpu_launch` and maps each such op to GPU blocks. Mapping is - one-to-one and the induction variables of `scf.foreach_thread` are - rewritten to gpu.block_id according to the `thread_dim_apping` attribute. - - Dynamic, `scf.foreach_thread` trip counts are currently not supported. - Dynamic block dim sizes are currently not supported. - - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are - currently supported. - - The operation alters the block size of the given gpu_launch using - gridDim argument. - - #### Return modes: - - This operation ignores non-gpu_launch ops and drops them in the return. - - If any scf.foreach_thread with tensors is found, the transform definitely - fails. - - If all the scf.foreach_thread operations contained within the LaunchOp - referred to by the `target` PDLOperation lower to GPU properly, the - transform succeeds. Otherwise the transform definitely fails. - - The returned handle points to the same LaunchOp operand, consuming it and - producing a new SSA value to satisfy chaining and linearity of the IR - properties. - }]; - - let arguments = (ins PDL_Operation:$target, - DefaultValuedAttr:$gridDim, - UnitAttr:$generate_gpu_launch); - let results = (outs PDL_Operation:$result); - - let assemblyFormat = "$target attr-dict"; - let extraClassDeclaration = [{ - ::mlir::DiagnosedSilenceableFailure applyToOne( - ::mlir::Operation *target, - ::llvm::SmallVectorImpl<::mlir::Operation *> &results, - ::mlir::transform::TransformState &state); - }]; -} - def VectorizeOp : Op { diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -125,32 +125,6 @@ FailureOr fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand); -/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is -/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten -/// to gpu.block_id according to the thread_dim_apping attribute. Dynamic, -/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block -/// dim sizes are currently not supported. -LogicalResult rewriteTopLevelForeachThreadToGpuBlocks( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> - blockIdGenerator, - SmallVector &gridDims); - -/// Finds the top level scf::ForeachThreadOp of given target. 
-FailureOr findTopLevelForeachThreadOp(Operation *target); - -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such -/// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in -/// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not -/// supported. Dynamic block dim sizes are currently not supported. -mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads( - RewriterBase &rewriter, Operation *target, - const SmallVector &blockDim, bool syncAfterDistribute); - /// Split the given `op` into two parts along the given iteration space /// `dimension` at the specified `splitPoint`, and return the two parts. /// diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -31,6 +31,7 @@ #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -115,6 +116,7 @@ linalg::registerTransformDialectExtension(registry); memref::registerTransformDialectExtension(registry); scf::registerTransformDialectExtension(registry); + gpu::registerTransformDialectExtension(registry); // Register all external models. arith::registerBufferizableOpInterfaceExternalModels(registry); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -82,6 +82,8 @@ MLIRTransformUtils ) +add_subdirectory(TransformOps) + if(MLIR_ENABLE_CUDA_RUNNER) if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) message(SEND_ERROR diff --git a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_dialect_library(MLIRGPUTransformOps + GPUTransformOps.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps + + DEPENDS + MLIRGPUTransformOpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRGPUTransforms + MLIRParser + MLIRPDLDialect + MLIRSideEffectInterfaces + MLIRTransformDialect + MLIRGPUOps + ) diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -0,0 +1,507 @@ +//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" +#include "mlir/Dialect/PDL/IR/PDL.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" + +using namespace mlir; +using namespace mlir::gpu; +using namespace mlir::transform; + +namespace { +/// A simple pattern rewriter that implements no special logic. +class SimpleRewriter : public PatternRewriter { +public: + SimpleRewriter(MLIRContext *context) : PatternRewriter(context) {} +}; +} // namespace + +/// Determines if the size of the kernel configuration is supported by the GPU +/// architecture being used. It presently makes use of CUDA limitations, however +/// that aspect may be enhanced for other GPUs. +static DiagnosedSilenceableFailure +checkGpuLimits(TransformOpInterface transformOp, Optional gridDimX, + Optional gridDimY, Optional gridDimZ, + Optional blockDimX, Optional blockDimY, + Optional blockDimZ) { + + static constexpr int max_total_blockdim = 1024; + static constexpr int max_blockdimx = 1024; + static constexpr int max_blockdimy = 1024; + static constexpr int max_blockdimz = 64; + static constexpr int max_total_griddim = 2147483647; + static constexpr int max_griddimx = 2147483647; + static constexpr int max_griddimy = 65535; + static constexpr int max_griddimz = 65535; + + if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > + max_total_blockdim || + (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > + max_total_griddim || + blockDimX.value_or(1) > max_blockdimx || + blockDimY.value_or(1) > max_blockdimy || + blockDimZ.value_or(1) > max_blockdimz || + gridDimY.value_or(1) > max_griddimy || + gridDimZ.value_or(1) > max_griddimz || + gridDimX.value_or(1) > max_griddimx) { + return transformOp.emitSilenceableError() + << "Trying to launch a GPU kernel with gridDim = (" + << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " + << gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1) + << ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1) + << "). It is larger than the limits."; + } + return DiagnosedSilenceableFailure::success(); +} + +/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and +/// put a terminator within. +static DiagnosedSilenceableFailure +createGpuLaunch(RewriterBase &rewriter, Location loc, + TransformOpInterface transformOp, LaunchOp &launchOp, + Optional gridDimX = llvm::None, + Optional gridDimY = llvm::None, + Optional gridDimZ = llvm::None, + Optional blockDimX = llvm::None, + Optional blockDimY = llvm::None, + Optional blockDimZ = llvm::None) { + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, + blockDimY, blockDimZ); + if (!diag.succeeded()) + return diag; + + auto createConst = [&](int dim) { + return rewriter.create(loc, dim); + }; + OpBuilder::InsertionGuard guard(rewriter); + Value one = createConst(1); + Value gridSizeX = gridDimX.has_value() ? 
createConst(gridDimX.value()) : one; + Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one; + Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one; + Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one; + Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one; + Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one; + launchOp = rewriter.create(loc, gridSizeX, gridSizeY, gridSizeZ, + blkSizeX, blkSizeY, blkSizeZ); + rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); + rewriter.create(loc); + return DiagnosedSilenceableFailure(success()); +} + +/// Alter kernel configuration of the given kernel. +static DiagnosedSilenceableFailure +alterGpuLaunch(SimpleRewriter &rewriter, LaunchOp gpuLaunch, + TransformOpInterface transformOp, + Optional gridDimX = llvm::None, + Optional gridDimY = llvm::None, + Optional gridDimZ = llvm::None, + Optional blockDimX = llvm::None, + Optional blockDimY = llvm::None, + Optional blockDimZ = llvm::None) { + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX, + blockDimY, blockDimZ); + if (!diag.succeeded()) + return diag; + + KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfterValue(currentBlockdim.x); + auto createConstValue = [&](int dim) { + return rewriter.create(currentBlockdim.x.getLoc(), + dim); + }; + + if (gridDimX.has_value()) + gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); + if (gridDimY.has_value()) + gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); + if (gridDimZ.has_value()) + gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); + if (blockDimX.has_value()) + gpuLaunch.getBlockSizeXMutable().assign( + createConstValue(blockDimX.value())); + if (blockDimY.has_value()) + gpuLaunch.getBlockSizeYMutable().assign( + createConstValue(blockDimY.value())); + if (blockDimZ.has_value()) + gpuLaunch.getBlockSizeZMutable().assign( + createConstValue(blockDimZ.value())); + return DiagnosedSilenceableFailure::success(); +} + +//===----------------------------------------------------------------------===// +// MapForeachToBlocks +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImp( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + function_ref &)> + blockIdGenerator, + SmallVectorImpl &gridDims, TransformOpInterface transformOp) { + if (foreachThreadOp.getNumResults() > 0) + return transformOp.emitSilenceableError() + << "only bufferized scf.foreach_thread lowers to gpu.block_id"; + if (foreachThreadOp.getNumThreads().size() > 3) + return transformOp.emitSilenceableError() + << "scf.foreach_thread with rank > 3 does not lower to gpu.block_id"; + + // Step 0. Outline the compute workload region and set up the workload + // operands. 
+ FailureOr> potentialGridDim = + foreachThreadOp.getPermutedNumThreads(rewriter); + + if (failed(potentialGridDim) || + llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) { + return !getConstantIntValue(ofr).has_value(); + })) { + return transformOp.emitSilenceableError() << "unsupported dynamic gridDim"; + } + + for (OpFoldResult ofr : *potentialGridDim) + gridDims.push_back(getConstantIntValue(ofr).value()); + + SmallVector blockOps; + blockIdGenerator(rewriter, foreachThreadOp, blockOps); + + // Step 1. Move the body of foreachThreadOp. + // Erase the terminator first, it will not be used since we are on buffers. + rewriter.eraseOp(foreachThreadOp.getTerminator()); + Block *targetBlock = foreachThreadOp->getBlock(); + Block::iterator insertionPoint = Block::iterator(foreachThreadOp); + Block &sourceBlock = foreachThreadOp.getRegion().front(); + targetBlock->getOperations().splice(insertionPoint, + sourceBlock.getOperations()); + + // Step 2. RAUW thread indices to thread ops. + SmallVector threadIndices = + *foreachThreadOp.getPermutedThreadIndices(); + assert(blockOps.size() == 3 && "3 block id ops are required"); + for (auto [blockIdx, blockOp] : llvm::zip(threadIndices, blockOps)) { + Value val = blockIdx; + Value blkOp = blockOp; + if (!val) + continue; + for (Operation *user : llvm::make_early_inc_range(val.getUsers())) + user->replaceUsesOfWith(val, blkOp); + } + + // Step 3. Erase old op. + rewriter.eraseOp(foreachThreadOp); + + return DiagnosedSilenceableFailure::success(); +} + +DiagnosedSilenceableFailure mlir::transform::gpu::findTopLevelForeachThreadOp( + Operation *target, scf::ForeachThreadOp &topLevelForeachThreadOp, + TransformOpInterface transformOp) { + auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + if (foreachThreadOp->getParentOfType()) + return WalkResult::advance(); + if (topLevelForeachThreadOp) + // TODO: Handle multiple foreach if there are no dependences between them + return WalkResult::interrupt(); + topLevelForeachThreadOp = foreachThreadOp; + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) + return transformOp.emitSilenceableError() + << "could not find a unique topLevel scf.foreach_thread"; + return DiagnosedSilenceableFailure::success(); +} + +/// This is a helper that is only used by MapForeachToBlocks::applyToOne. It +/// generates the GPU dialect's block_id ops. +static void generateGpuBlockIds(RewriterBase &rewriter, + scf::ForeachThreadOp foreachOp, + SmallVectorImpl &blockOps) { + Location loc = foreachOp->getLoc(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(foreachOp); + IndexType indexType = rewriter.getIndexType(); + SmallVector gpuDims{Dimension::x, Dimension::y, Dimension::z}; + for (int64_t idx : llvm::seq(0, gpuDims.size())) { + blockOps.push_back( + rewriter.create(loc, indexType, gpuDims[idx])); + } +} + +DiagnosedSilenceableFailure +transform::MapForeachToBlocks::applyToOne(Operation *target, + SmallVectorImpl &results, + transform::TransformState &state) { + LaunchOp gpuLaunch = dyn_cast(target); + SimpleRewriter rewriter(getContext()); + auto transformOp = cast(getOperation()); + + if (!getGenerateGpuLaunch() && !gpuLaunch) { + results.assign({target}); + DiagnosedSilenceableFailure diag = + emitSilenceableError() + << "Given target is not gpu.launch, set `generate_gpu_launch` " + "attribute"; + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + scf::ForeachThreadOp topLevelForeachThreadOp; + DiagnosedSilenceableFailure diag = + mlir::transform::gpu::findTopLevelForeachThreadOp( + target, topLevelForeachThreadOp, transformOp); + if (!diag.succeeded()) { + results.assign({target}); + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(topLevelForeachThreadOp); + + // Generate gpu launch here and move the foreach_thread inside + if (getGenerateGpuLaunch()) { + DiagnosedSilenceableFailure diag = + createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); + if (!diag.succeeded()) { + results.assign({target}); + return diag; + } + rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); + Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); + rewriter.eraseOp(topLevelForeachThreadOp); + topLevelForeachThreadOp = cast(newForeachThreadOp); + } + + SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); + diag = mlir::transform::gpu::mapForeachToBlocksImp( + rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim, + transformOp); + if (diag.succeeded()) { + diag = alterGpuLaunch(rewriter, gpuLaunch, + cast(getOperation()), + gridDim[0], gridDim[1], gridDim[2]); + } + + results.assign({gpuLaunch}); + return diag; +} + +//===----------------------------------------------------------------------===// +// MapNestedForeachToThreads +//===----------------------------------------------------------------------===// + +/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// op to GPU threads. Mapping is one-to-one and the induction variables of +/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the +/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in +/// which case, the union of the number of threads is computed and may result +/// in predication. Dynamic, `scf.foreach_thread` trip counts are currently +/// not supported. Dynamic block dim sizes are currently not supported.
+static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads( + RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, + const SmallVectorImpl &globalBlockDims, bool syncAfterDistribute, + llvm::Optional transformOp) { + auto failureHelper = + [&](const Twine &message) -> DiagnosedSilenceableFailure { + if (transformOp.has_value()) { + return transformOp->emitSilenceableError() << message; + } + foreachThreadOp->emitError() << message; + return DiagnosedSilenceableFailure::definiteFailure(); + }; + + if (foreachThreadOp.getNumResults() > 0) + return failureHelper( + "only bufferized scf.foreach_thread lowers to gpu.thread_id"); + + if (foreachThreadOp.getNumThreads().size() > 3) + return failureHelper( + "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id"); + + auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter); + if (failed(potentialBlockDim) || + llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) { + return !getConstantIntValue(ofr).has_value(); + })) { + return failureHelper("unsupported dynamic blockdim size"); + } + + SmallVector blockDim = + llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) { + return getConstantIntValue(ofr).value(); + })); + + // Step 1. Create the gpu.thread ops + Location loc = foreachThreadOp.getLoc(); + IndexType indexType = rewriter.getIndexType(); + + SmallVector gpuDims{Dimension::x, Dimension::y, Dimension::z}; + SmallVector threadOps; + for (int64_t idx : llvm::seq(0, blockDim.size())) { + threadOps.push_back( + rewriter.create(loc, indexType, gpuDims[idx])); + } + // Step 2. Maybe create conditionals to predicate the region. + Value predicate; + for (auto [threadId, blockDim, globalBlockDim] : + llvm::zip(threadOps, blockDim, globalBlockDims)) { + if (blockDim > globalBlockDim) { + return failureHelper( + "The GPU threads are fewer than the loop trip counts. " + "Try to tile scf.foreach_thread before mapping."); + } + if (blockDim == globalBlockDim) + continue; + Value blockIdx = rewriter.create(loc, blockDim); + Value tmpPredicate = rewriter.create( + loc, arith::CmpIPredicate::ult, threadId, blockIdx); + predicate = + predicate ? rewriter.create(loc, predicate, tmpPredicate) + : tmpPredicate; + } + + // Step 3. Move the body of foreachThreadOp. + // Erase the terminator first, it will not be used. + rewriter.eraseOp(foreachThreadOp.getTerminator()); + Block *targetBlock; + Block::iterator insertionPoint; + if (predicate) { + // Step 3.a. If predicated, move at the beginning. + auto ifOp = + rewriter.create(loc, predicate, /*withElseRegion=*/false); + targetBlock = ifOp.thenBlock(); + insertionPoint = ifOp.thenBlock()->begin(); + } else { + // Step 3.a. Otherwise, move inline just before foreachThreadOp. + targetBlock = foreachThreadOp->getBlock(); + insertionPoint = Block::iterator(foreachThreadOp); + } + Block &sourceBlock = foreachThreadOp.getRegion().front(); + targetBlock->getOperations().splice(insertionPoint, + sourceBlock.getOperations()); + + // Step 4. RAUW thread indices to thread ops. + SmallVector threadIndices = + *foreachThreadOp.getPermutedThreadIndices(); + for (auto [threadIdx, threadOp] : llvm::zip(threadIndices, threadOps)) { + Value val = threadIdx; + Value op = threadOp; + if (!val) + continue; + for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { + user->replaceUsesOfWith(val, op); + } + } + + // Step 5. syncthreads. + // TODO: Need warpsync + if (syncAfterDistribute) + rewriter.create(loc); + + // Step 6. Erase old op. 
+ rewriter.eraseOp(foreachThreadOp); + + return DiagnosedSilenceableFailure::success(); +} + +DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImp( + RewriterBase &rewriter, Operation *target, + const SmallVectorImpl &blockDim, bool syncAfterDistribute, + llvm::Optional transformOp) { + DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success(); + target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + rewriter.setInsertionPoint(foreachThreadOp); + diag = rewriteOneForeachThreadToGpuThreads( + rewriter, foreachThreadOp, blockDim, syncAfterDistribute, transformOp); + return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt(); + }); + return diag; +} + +DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne( + ::mlir::Operation *target, + ::llvm::SmallVectorImpl<::mlir::Operation *> &results, + ::mlir::transform::TransformState &state) { + LaunchOp gpuLaunch = dyn_cast(target); + auto transformOp = cast(getOperation()); + + if (!gpuLaunch) { + results.assign({target}); + return emitSilenceableError() << "Given target is not gpu.launch"; + } + + SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); + blockDim.resize(/*size=*/3, /*value=*/1); + + DiagnosedSilenceableFailure diag = + checkGpuLimits(transformOp, llvm::None, llvm::None, llvm::None, + blockDim[0], blockDim[1], blockDim[2]); + if (diag.isSilenceableFailure()) { + results.assign({target}); + diag.attachNote(getLoc()) << getBlockDimAttrName() << " is very large"; + return diag; + } + + SimpleRewriter rewriter(getContext()); + rewriter.setInsertionPoint(target); + + diag = mlir::transform::gpu::mapNestedForeachToThreadsImp( + rewriter, target, blockDim, getSyncAfterDistribute(), llvm::None); + if (diag.succeeded()) { + diag = + alterGpuLaunch(rewriter, gpuLaunch, transformOp, llvm::None, llvm::None, + llvm::None, blockDim[0], blockDim[1], blockDim[2]); + } + + results.assign({gpuLaunch}); + return diag; +} + +//===----------------------------------------------------------------------===// +// Transform op registration +//===----------------------------------------------------------------------===// + +namespace { +/// Registers new ops and declares PDL as dependent dialect since the +/// additional ops are using PDL types for operands and results. 
+class GPUTransformDialectExtension + : public transform::TransformDialectExtension< + GPUTransformDialectExtension> { +public: + GPUTransformDialectExtension() { + declareDependentDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + registerTransformOps< +#define GET_OP_LIST +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" + >(); + } +}; +} // namespace + +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" + +void mlir::gpu::registerTransformDialectExtension(DialectRegistry ®istry) { + registry.addExtensions(); +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1167,392 +1167,6 @@ modifiesPayload(effects); } -//===----------------------------------------------------------------------===// -// MapNestedForeachThreadToGpuThreads -//===----------------------------------------------------------------------===// - -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such -/// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in -/// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not -/// supported. Dynamic block dim sizes are currently not supported. -static FailureOr> rewriteOneForeachThreadToGpuThreads( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - const SmallVector &globalBlockDims, bool syncAfterDistribute) { - if (foreachThreadOp.getNumResults() > 0) - return foreachThreadOp->emitError( - "only bufferized scf.foreach_thread lowers to gpu.thread"); - if (foreachThreadOp.getNumThreads().size() > 3) - return foreachThreadOp->emitError( - "scf.foreach_thread with rank > 3 does not lower to gpu.thread"); - - auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter); - if (failed(potentialBlockDim) || - llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) - return foreachThreadOp->emitError("unsupported dynamic blockdim size"); - - SmallVector blockDim = - llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) { - return getConstantIntValue(ofr).value(); - })); - - // Step 1. Create the gpu.thread ops - Location loc = foreachThreadOp.getLoc(); - IndexType indexType = rewriter.getIndexType(); - - SmallVector gpuDims{gpu::Dimension::x, gpu::Dimension::y, - gpu::Dimension::z}; - SmallVector threadOps; - for (int64_t idx : llvm::seq(0, blockDim.size())) { - threadOps.push_back( - rewriter.create(loc, indexType, gpuDims[idx])); - } - // Step 2. Maybe create conditionals to predicate the region. - Value predicate; - for (auto [threadId, blockDim, globalBlockDim] : - llvm::zip(threadOps, blockDim, globalBlockDims)) { - if (blockDim > globalBlockDim) { - return foreachThreadOp.emitOpError("blockDim size overflow: ") - << blockDim << " > " << globalBlockDim; - } - if (blockDim == globalBlockDim) - continue; - Value tmpPredicate = rewriter.create( - loc, arith::CmpIPredicate::ult, threadId, - rewriter.create(loc, blockDim)); - predicate = - predicate ? 
rewriter.create(loc, predicate, tmpPredicate) - : tmpPredicate; - } - - // Step 3. Move the body of foreachThreadOp. - // Erase the terminator first, it will not be used. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock; - Block::iterator insertionPoint; - if (predicate) { - // Step 3.a. If predicated, move at the beginning. - auto ifOp = - rewriter.create(loc, predicate, /*withElseRegion=*/false); - targetBlock = ifOp.thenBlock(); - insertionPoint = ifOp.thenBlock()->begin(); - } else { - // Step 3.a. Otherwise, move inline just before foreachThreadOp. - targetBlock = foreachThreadOp->getBlock(); - insertionPoint = Block::iterator(foreachThreadOp); - } - Block &sourceBlock = foreachThreadOp.getRegion().front(); - targetBlock->getOperations().splice(insertionPoint, - sourceBlock.getOperations()); - - // Step 4. RAUW thread indices to thread ops. - SmallVector threadIndices = - *foreachThreadOp.getPermutedThreadIndices(); - for (auto it : llvm::zip(threadIndices, threadOps)) { - Value val = std::get<0>(it); - if (!val) - continue; - for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { - rewriter.updateRootInPlace( - user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); - } - } - - // Step 5. syncthreads. - // TODO: Need warpsync - if (syncAfterDistribute) - rewriter.create(loc); - - // Step 6. Erase old op. - rewriter.eraseOp(foreachThreadOp); - - return *potentialBlockDim; -} - -mlir::WalkResult mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads( - RewriterBase &rewriter, Operation *target, - const SmallVector &blockDim, bool syncAfterDistribute) { - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - rewriter.setInsertionPoint(foreachThreadOp); - if (failed(rewriteOneForeachThreadToGpuThreads( - rewriter, foreachThreadOp, blockDim, syncAfterDistribute))) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - return walkResult; -} - -static LogicalResult -checkGpuLimits(Optional gridDimX, Optional gridDimY, - Optional gridDimZ, Optional blockDimX, - Optional blockDimY, Optional blockDimZ) { - // TODO The limits should live in the gpu dialect, but it's not like that - // right now. 
Read them in the common gpu dialect - if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > - 1024 || - gridDimY.value_or(1) > 65535 || gridDimZ.value_or(1) > 65535 || - gridDimX.value_or(1) > 2147483647) - return failure(); - return success(); -} - -/// Alter grid or block dimensions of the given kernel -static LogicalResult alterGpuLaunch(SimpleRewriter &rewriter, - gpu::LaunchOp gpuLaunch, - Optional gridDimX = llvm::None, - Optional gridDimY = llvm::None, - Optional gridDimZ = llvm::None, - Optional blockDimX = llvm::None, - Optional blockDimY = llvm::None, - Optional blockDimZ = llvm::None) { - if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, - blockDimZ))) { - gpuLaunch->emitError( - "Requested kernel thread configuration is larger than the limits"); - return failure(); - } - - gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointAfterValue(currentBlockdim.x); - auto createConstValue = [&](int dim) { - return rewriter.create(currentBlockdim.x.getLoc(), - dim); - }; - - if (gridDimX.has_value()) - gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value())); - if (gridDimY.has_value()) - gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value())); - if (gridDimZ.has_value()) - gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value())); - if (blockDimX.has_value()) - gpuLaunch.getBlockSizeXMutable().assign( - createConstValue(blockDimX.value())); - if (blockDimY.has_value()) - gpuLaunch.getBlockSizeYMutable().assign( - createConstValue(blockDimY.value())); - if (blockDimZ.has_value()) - gpuLaunch.getBlockSizeZMutable().assign( - createConstValue(blockDimZ.value())); - return success(); -} - -DiagnosedSilenceableFailure -transform::MapNestedForeachThreadToGpuThreads::applyToOne( - Operation *target, SmallVectorImpl &results, - transform::TransformState &state) { - - gpu::LaunchOp gpuLaunch = dyn_cast(target); - if (!gpuLaunch) { - target->emitError("Given target is not gpu.launch"); - return DiagnosedSilenceableFailure::definiteFailure(); - } - - SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); - blockDim.resize(/*size=*/3, /*value=*/1); - SimpleRewriter rewriter(getContext()); - rewriter.setInsertionPoint(target); - auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads( - rewriter, target, blockDim, getSyncAfterDistribute()); - if (walkResult.wasInterrupted()) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - LogicalResult result = - alterGpuLaunch(rewriter, gpuLaunch, llvm::None, llvm::None, llvm::None, - blockDim[0], blockDim[1], blockDim[2]); - if (failed(result)) - return DiagnosedSilenceableFailure::definiteFailure(); - - results.assign({target}); - return DiagnosedSilenceableFailure(success()); -} - -//===----------------------------------------------------------------------===// -// MapNestedForeachThreadToGpuBlocks -//===----------------------------------------------------------------------===// - -LogicalResult mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> - blockIdGenerator, - SmallVector &gridDims) { - if (foreachThreadOp.getNumResults() > 0) - return foreachThreadOp->emitError( - "only bufferized scf.foreach_thread lowers to gpu.block_id"); - if (foreachThreadOp.getNumThreads().size() > 3) - return foreachThreadOp->emitError( - 
"scf.foreach_thread with rank > 3 does not lower to gpu.block_id"); - - // Step 0. Outline the compute workload region and set up the workload - // operands. - auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter); - if (failed(potentialGridDim) || - llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) { - return !getConstantIntValue(ofr).has_value(); - })) - return foreachThreadOp->emitError("unsupported dynamic gridDim"); - - for (OpFoldResult ofr : *potentialGridDim) - gridDims.push_back(getConstantIntValue(ofr).value()); - - SmallVector blockOps; - blockIdGenerator(rewriter, foreachThreadOp, blockOps); - - // Step 1. Move the body of foreachThreadOp. - // Erase the terminator first, it will not be used since we are on buffers. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock = foreachThreadOp->getBlock(); - Block::iterator insertionPoint = Block::iterator(foreachThreadOp); - Block &sourceBlock = foreachThreadOp.getRegion().front(); - targetBlock->getOperations().splice(insertionPoint, - sourceBlock.getOperations()); - - // Step 2. RAUW thread indices to thread ops. - SmallVector threadIndices = - *foreachThreadOp.getPermutedThreadIndices(); - assert(blockOps.size() == 3 && "3 block id ops are required"); - for (auto it : llvm::zip(threadIndices, blockOps)) { - Value val = std::get<0>(it); - if (!val) - continue; - for (Operation *user : llvm::make_early_inc_range(val.getUsers())) { - rewriter.updateRootInPlace( - user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); }); - } - } - - // Step 3. Erase old op. - rewriter.eraseOp(foreachThreadOp); - - return success(); -} - -FailureOr -mlir::linalg::findTopLevelForeachThreadOp(Operation *target) { - scf::ForeachThreadOp topLevelForeachThreadOp; - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - if (foreachThreadOp->getParentOfType()) - return WalkResult::advance(); - if (topLevelForeachThreadOp) - // TODO Handle multiple foreach if there is no dependences between them - return WalkResult::interrupt(); - topLevelForeachThreadOp = foreachThreadOp; - return WalkResult::advance(); - }); - - if (walkResult.wasInterrupted()) - return target->emitError( - "could not find a unique topLevel scf.foreach_thread"); - - return topLevelForeachThreadOp; -} - -/// Create gpuLauncOp with given kernel configurations -static FailureOr -createGpuLaunch(RewriterBase &rewriter, Location loc, - Optional gridDimX = llvm::None, - Optional gridDimY = llvm::None, - Optional gridDimZ = llvm::None, - Optional blockDimX = llvm::None, - Optional blockDimY = llvm::None, - Optional blockDimZ = llvm::None) { - if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, - blockDimZ))) - return failure(); - auto createConstant = [&](int dim) { - return rewriter.create(loc, dim); - }; - Value one = createConstant(1); - Value gridSizeX = - gridDimX.has_value() ? createConstant(gridDimX.value()) : one; - Value gridSizeY = - gridDimY.has_value() ? createConstant(gridDimY.value()) : one; - Value gridSizeZ = - gridDimZ.has_value() ? createConstant(gridDimZ.value()) : one; - Value blockSizeX = - blockDimX.has_value() ? createConstant(blockDimX.value()) : one; - Value blockSizeY = - blockDimY.has_value() ? createConstant(blockDimY.value()) : one; - Value blockSizeZ = - blockDimZ.has_value() ? 
createConstant(blockDimZ.value()) : one; - auto launchOp = rewriter.create( - loc, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ); - rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); - rewriter.create(loc); - return launchOp; -} - -/// This is an helper that is only used in -/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects block_id -static void generateGpuBlockIds(RewriterBase &rewriter, - scf::ForeachThreadOp foreachOp, - SmallVector &blockOps) { - Location loc = foreachOp->getLoc(); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(foreachOp); - IndexType indexType = rewriter.getIndexType(); - SmallVector gpuDims{gpu::Dimension::x, gpu::Dimension::y, - gpu::Dimension::z}; - for (int64_t idx : llvm::seq(0, gpuDims.size())) { - blockOps.push_back( - rewriter.create(loc, indexType, gpuDims[idx])); - } -} - -DiagnosedSilenceableFailure -transform::MapNestedForeachThreadToGpuBlocks::applyToOne( - Operation *target, SmallVectorImpl &results, - transform::TransformState &state) { - gpu::LaunchOp gpuLaunch = dyn_cast(target); - SimpleRewriter rewriter(getContext()); - - if (!getGenerateGpuLaunch() && !gpuLaunch) { - target->emitError("Given target is not gpu.launch, set " - "`generate_gpu_launch` attribute"); - return DiagnosedSilenceableFailure::definiteFailure(); - } - - auto res = mlir::linalg::findTopLevelForeachThreadOp(target); - if (failed(res)) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - scf::ForeachThreadOp topLevelForeachThreadOp = *res; - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(topLevelForeachThreadOp); - - // Generate gpu launch here and move the foreach_thread inside - if (getGenerateGpuLaunch()) { - FailureOr maybeGpuLaunch = - createGpuLaunch(rewriter, target->getLoc()); - if (failed(maybeGpuLaunch)) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - gpuLaunch = *maybeGpuLaunch; - rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); - Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); - rewriter.eraseOp(topLevelForeachThreadOp); - topLevelForeachThreadOp = - dyn_cast(newForeachThreadOp); - } - - SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); - if (failed(mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks( - rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim))) - return DiagnosedSilenceableFailure(reportUnknownTransformError(target)); - - if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1], - gridDim[2]))) - return DiagnosedSilenceableFailure::definiteFailure(); - - results.assign({gpuLaunch}); - return DiagnosedSilenceableFailure(success()); -} - //===----------------------------------------------------------------------===// // TileToForeachThreadOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir rename from mlir/test/Dialect/Linalg/transform-gpu.mlir rename to mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/Linalg/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -35,7 +35,7 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { blockDim = [12, 9, 1]} + transform.gpu.map_foreach_to_blocks %funcop { 
blockDim = [12, 9, 1]} } } @@ -92,7 +92,7 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] } + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1] } } } @@ -134,8 +134,8 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["func.func"]} in %arg0 - %gpuLaunch = transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { generate_gpu_launch } - transform.structured.map_nested_foreach_thread_to_gpu_threads %gpuLaunch { blockDim = [32, 4, 1] } + %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch } + transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] } } } @@ -171,6 +171,6 @@ transform.sequence %arg0 failures(propagate) { ^bb1(%arg1: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 - transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } } } diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3761,6 +3761,65 @@ ]), ) +td_library( + name = "GPUTransformOpsTdFiles", + srcs = [ + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td", + ], + includes = ["include"], + deps = [ + ":PDLDialectTdFiles", + ":TransformDialectTdFiles", + ], +) + +gentbl_cc_library( + name = "GPUTransformOpsIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + ["-gen-op-decls"], + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td", + deps = [ + ":GPUTransformOpsTdFiles", + ], +) + +cc_library( + name = "GPUTransformOps", + srcs = [ + "lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp", + ], + hdrs = [ + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h", + ], + includes = ["include"], + deps = [ + ":ArithDialect", + ":AsmParser", + ":ControlFlowDialect", + ":GPUDialect", + ":GPUTransformOpsIncGen", + ":GPUTransforms", + ":IR", + ":PDLDialect", + ":Parser", + ":SCFDialect", + ":SideEffectInterfaces", + ":TransformDialect", + ":TransformUtils", + "//llvm:Support", + ], +) + td_library( name = "LLVMOpsTdFiles", srcs = [ @@ -6401,6 +6460,7 @@ ":GPUToROCDLTransforms", ":GPUToSPIRV", ":GPUToVulkanTransforms", + ":GPUTransformOps", ":GPUTransforms", ":IR", ":LLVMDialect",