diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -53,6 +53,7 @@
 /// dim sizes are currently not supported.
 DiagnosedSilenceableFailure mapForeachToBlocksImpl(
     RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
+    SmallVector<int64_t> mapping,
     function_ref<void(RewriterBase &, scf::ForeachThreadOp,
                       SmallVectorImpl<Value> &)>
         blockIdGenerator,
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -160,6 +160,65 @@
   let arguments = (ins PDL_Operation:$target,
                    DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
                    UnitAttr:$generate_gpu_launch);
   let results = (outs PDL_Operation:$result);
+
+  let assemblyFormat = "$target attr-dict";
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::Operation *target,
+        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
+def MapForeach :
+  Op<Transform_Dialect, "gpu.map_foreach",
+    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
+     TransformEachOpTrait, TransformOpInterface]> {
+  let description = [{
+    Target the gpu_launch op and rewrite the top level `scf.foreach_thread`
+    to distributed gpu.block_id values, or to global ids
+    (block_id * block_dim + thread_id) when the loop carries a `#gpu.global`
+    mapping. If the `generate_gpu_launch` attribute is set, a `gpu_launch`
+    is generated first and the top level `scf.foreach_thread` is moved
+    inside it.
+
+    The operation searches for top level `scf.foreach_thread` ops under
+    `gpu_launch` and distributes each such op to GPU blocks. The mapping is
+    one-to-one and the induction variables of `scf.foreach_thread` are
+    rewritten according to the `mapping` attribute.
+
+    Dynamic `scf.foreach_thread` trip counts are currently not supported.
+    Dynamic block dim sizes are currently not supported.
+
+    Only **bufferized** scf.foreach_thread ops are currently supported.
+    Only scf.foreach_thread ops distributed to **at most 3 dimensions** are
+    currently supported.
+
+    The operation alters the grid and block sizes of the given gpu_launch:
+    the grid size is derived from the top level `scf.foreach_thread` trip
+    counts and the block size comes from the mandatory `blockDim` argument.
+
+    #### Return modes:
+
+    This operation ignores non-gpu_launch ops (when `generate_gpu_launch` is
+    not set) and drops them in the return.
+
+    If any scf.foreach_thread with tensors is found, the transform definitely
+    fails.
+
+    If all the scf.foreach_thread operations contained within the LaunchOp
+    referred to by the `target` PDLOperation lower to GPU properly, the
+    transform succeeds. Otherwise the transform definitely fails.
+
+    The returned handle points to the same LaunchOp operand, consuming it and
+    producing a new SSA value to satisfy chaining and linearity of the IR
+    properties.
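+
+    #### Example:
+
+    A usage sketch mirroring the tests in this revision: match the payload
+    `func.func`, let the transform create the `gpu.launch` op, and distribute
+    the loops with a 4x4x4 block:
+
+    ```mlir
+    %func = transform.structured.match ops{["func.func"]} in %arg0
+    %gpu_launch = transform.gpu.map_foreach %func
+        { generate_gpu_launch, blockDim = [4, 4, 4] }
+    ```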
+  }];
+
+  let arguments = (ins PDL_Operation:$target,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
                    UnitAttr:$generate_gpu_launch);
   let results = (outs PDL_Operation:$result);
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -155,6 +155,7 @@
 DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
     RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
+    SmallVector<int64_t> mapping,
     function_ref<void(RewriterBase &, scf::ForeachThreadOp,
                       SmallVectorImpl<Value> &)>
         blockIdGenerator,
@@ -168,18 +169,6 @@
   // Step 0. Outline the compute workload region and set up the workload
   // operands.
-  SmallVector<int64_t> mapping;
-  if (!foreachThreadOp.getMapping().has_value())
-    return transformOp.emitSilenceableError() << "Mapping must be present";
-  for (auto map : *foreachThreadOp.getMapping()) {
-    if (auto blockMap = map.dyn_cast<GPUBlockMappingAttr>()) {
-      mapping.push_back((int64_t)blockMap.getBlock());
-    } else {
-      return transformOp.emitSilenceableError()
-             << "Mapping must be #gpu.block";
-    }
-  }
-
   FailureOr<SmallVector<OpFoldResult>> potentialGridDim =
       foreachThreadOp.getPermutedNumThreads(rewriter, mapping);
@@ -260,6 +249,49 @@
   }
 }
 
+/// This is a helper that is only used in
+/// rewriteTopLevelForeachThreadToGpuBlocks. It generates the GPU dialect ops
+/// computing block_id * block_dim + thread_id for each mapped dimension.
+static void generateGpuGlobalIds(RewriterBase &rewriter,
+                                 scf::ForeachThreadOp foreachOp,
+                                 SmallVectorImpl<Value> &blockOps) {
+  Location loc = foreachOp->getLoc();
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPoint(foreachOp);
+  IndexType indexType = rewriter.getIndexType();
+  SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
+  for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
+    auto bidx = rewriter.create<BlockIdOp>(loc, indexType, gpuDims[idx]);
+    auto tidx = rewriter.create<ThreadIdOp>(loc, indexType, gpuDims[idx]);
+    auto bdimx = rewriter.create<BlockDimOp>(loc, indexType, gpuDims[idx]);
+    auto bid = rewriter.create<arith::MulIOp>(loc, bdimx, bidx);
+    auto globalid = rewriter.create<arith::AddIOp>(loc, bid, tidx);
+    blockOps.push_back(globalid);
+  }
+}
+
+/// Fills `mapping` with the permutation encoded by a list of #gpu.block
+/// attributes, failing if any entry is of a different kind.
+static LogicalResult fill_blocks(mlir::ArrayAttr maps,
+                                 SmallVector<int64_t> &mapping) {
+  for (auto map : maps) {
+    if (auto blockMap = map.dyn_cast<GPUBlockMappingAttr>())
+      mapping.push_back((int64_t)blockMap.getBlock());
+    else
+      return failure();
+  }
+  return success();
+}
+
+/// Fills `mapping` with the permutation encoded by a list of #gpu.global
+/// attributes, failing if any entry is of a different kind.
+static LogicalResult fill_global(mlir::ArrayAttr maps,
+                                 SmallVector<int64_t> &mapping) {
+  for (auto map : maps) {
+    if (auto globalMap = map.dyn_cast<GPUGlobalMappingAttr>())
+      mapping.push_back((int64_t)globalMap.getGlobal());
+    else
+      return failure();
+  }
+  return success();
+}
+
 DiagnosedSilenceableFailure
 transform::MapForeachToBlocks::applyToOne(Operation *target,
                                           SmallVectorImpl<Operation *> &results,
@@ -305,9 +337,22 @@
     topLevelForeachThreadOp = cast<scf::ForeachThreadOp>(newForeachThreadOp);
   }
 
+  SmallVector<int64_t> mapping;
+  auto maps = topLevelForeachThreadOp.getMapping();
+  if (!maps.has_value() || maps->empty()) {
+    results.assign({target});
+    return emitSilenceableError() << "Mapping must be present and must be "
+                                     "#gpu.block";
+  }
+  if (failed(fill_blocks(maps.value(), mapping))) {
+    results.assign({target});
+    return emitSilenceableError() << "Mapping must be #gpu.block";
+  }
+
   SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
   diag = mlir::transform::gpu::mapForeachToBlocksImpl(
-      rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim,
+      rewriter, topLevelForeachThreadOp, mapping, generateGpuBlockIds, gridDim,
       transformOp);
   if (diag.succeeded()) {
     gridDim.resize(3, 1);
@@ -504,6 +549,108 @@
   return diag;
 }
 
+//===----------------------------------------------------------------------===//
+// MapForeach
+//===----------------------------------------------------------------------===//
+
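+/// Rewrites the top level scf.foreach_thread of the target gpu.launch (or of
+/// a newly created gpu.launch when `generate_gpu_launch` is set) to block ids
+/// or to global ids, maps the nested scf.foreach_thread ops to threads, and
+/// finally updates the launch with the computed grid and block sizes.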
"attribute"; + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + scf::ForeachThreadOp topLevelForeachThreadOp; + DiagnosedSilenceableFailure diag = + mlir::transform::gpu::findTopLevelForeachThreadOp( + target, topLevelForeachThreadOp, transformOp); + if (!diag.succeeded()) { + results.assign({target}); + diag.attachNote(target->getLoc()) << "when applied to this payload op"; + return diag; + } + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(topLevelForeachThreadOp); + + // Generate gpu launch here and move the foreach_thread inside + if (getGenerateGpuLaunch()) { + DiagnosedSilenceableFailure diag = + createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); + if (!diag.succeeded()) { + results.assign({target}); + return diag; + } + rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); + Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); + rewriter.eraseOp(topLevelForeachThreadOp); + topLevelForeachThreadOp = cast(newForeachThreadOp); + } + + SmallVector mapping; + SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); + SmallVector blockDim = extractFromI64ArrayAttr(getBlockDim()); + if (blockDim.empty()) { + results.assign({target}); + return emitSilenceableError() << getBlockDimAttrName() << " is missing"; + } + + auto maps = topLevelForeachThreadOp.getMapping(); + if (!maps.has_value() || maps->empty()) { + results.assign({target}); + return emitSilenceableError() << "Mapping must be present, which can be " + "#gpu.block or #gpu.global"; + } + bool globalIdMapping = false; + if (failed(fill_blocks(maps.value(), mapping))) { + if (failed(fill_global(maps.value(), mapping))) { + results.assign({target}); + return emitSilenceableError() + << "Mapping must be same type of #gpu.global or #gpu.block"; + } + globalIdMapping = true; + } + + diag = mlir::transform::gpu::mapForeachToBlocksImpl( + rewriter, topLevelForeachThreadOp, mapping, + globalIdMapping ? generateGpuGlobalIds : generateGpuBlockIds, gridDim, + transformOp); + if (diag.succeeded()) { + // process inner loops + diag = gpu::mapNestedForeachToThreadsImpl(rewriter, gpuLaunch, blockDim, + true, transformOp); + if (!diag.succeeded()) { + results.assign({gpuLaunch}); + return diag; + } + + gridDim.resize(3, 1); + auto calcBlockSize = [=](int64_t gridX, int64_t blockX) { + return globalIdMapping ? 
+    diag = alterGpuLaunch(rewriter, gpuLaunch,
+                          cast<TransformOpInterface>(getOperation()),
+                          calcGridDim(gridDim[0], blockDim[0]),
+                          calcGridDim(gridDim[1], blockDim[1]),
+                          calcGridDim(gridDim[2], blockDim[2]), blockDim[0],
+                          blockDim[1], blockDim[2]);
+  }
+
+  results.assign({gpuLaunch});
+  return diag;
+}
+
 //===----------------------------------------------------------------------===//
 // Transform op registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -162,3 +162,102 @@
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
   transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
 }
+
+
+// -----
+
+!type3d = memref<32x64x32xf32>
+
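+// Checks that a #gpu.global mapping rewrites the loop indices to
+// block_id * block_dim + thread_id and that the grid size is the trip count
+// divided by the block size (32x64x32 iterations with a 4x4x4 block yield an
+// 8x16x8 grid).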
+// CHECK-LABEL: func.func @saxpyglobal(
+// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x64x32xf32>
+// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x64x32xf32>
+func.func @saxpyglobal(%x: !type3d, %y: !type3d, %alpha : f32) -> !type3d {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %c4 = arith.constant 4 : index
+// CHECK:   %[[C4:.*]] = arith.constant 4 : index
+// CHECK:   %[[C8:.*]] = arith.constant 8 : index
+// CHECK:   %[[C16:.*]] = arith.constant 16 : index
+// CHECK:   gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C8]], %{{.*}} = %[[C16]], %{{.*}} = %[[C8]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C4]], %{{.*}} = %[[C4]], %{{.*}} = %[[C4]])
+// CHECK:   %[[BLKX:.*]] = gpu.block_id x
+// CHECK:   %[[TIDX:.*]] = gpu.thread_id x
+// CHECK:   %[[BDIMX:.*]] = gpu.block_dim x
+// CHECK:   %[[MULX:.*]] = arith.muli %[[BDIMX]], %[[BLKX]] : index
+// CHECK:   %[[IDX:.*]] = arith.addi %[[MULX]], %[[TIDX]] : index
+// CHECK:   %[[BLKY:.*]] = gpu.block_id y
+// CHECK:   %[[TIDY:.*]] = gpu.thread_id y
+// CHECK:   %[[BDIMY:.*]] = gpu.block_dim y
+// CHECK:   %[[MULY:.*]] = arith.muli %[[BDIMY]], %[[BLKY]] : index
+// CHECK:   %[[IDY:.*]] = arith.addi %[[MULY]], %[[TIDY]] : index
+// CHECK:   %[[BLKZ:.*]] = gpu.block_id z
+// CHECK:   %[[TIDZ:.*]] = gpu.thread_id z
+// CHECK:   %[[BDIMZ:.*]] = gpu.block_dim z
+// CHECK:   %[[MULZ:.*]] = arith.muli %[[BDIMZ]], %[[BLKZ]] : index
+// CHECK:   %[[IDZ:.*]] = arith.addi %[[MULZ]], %[[TIDZ]] : index
+// CHECK:   memref.load %[[ARGX]][%[[IDZ]], %[[IDY]], %[[IDX]]]
+// CHECK:   memref.load %[[ARGY]][%[[IDZ]], %[[IDY]], %[[IDX]]]
+  scf.foreach_thread (%i, %j, %k) in (%c32, %c64, %c32) {
+    %4 = memref.load %x[%i, %j, %k] : !type3d
+    %5 = memref.load %y[%i, %j, %k] : !type3d
+    %6 = math.fma %alpha, %4, %5 : f32
+    memref.store %6, %y[%i, %j, %k] : !type3d
+  } { mapping = [#gpu.global<z>, #gpu.global<y>, #gpu.global<x>] }
+  return %y : !type3d
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["func.func"]} in %arg0
+  %gpuLaunch = transform.gpu.map_foreach %funcop { generate_gpu_launch, blockDim = [4, 4, 4] }
+}
+
+// -----
+
+!type4d = memref<32x64x4x32xf32>
+
+// CHECK-LABEL: func.func @saxpy4d(
+// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>
+// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32>
+// CHECK-SAME:    %[[ARGA:[0-9a-z]+]]: memref<32x64x4x32xf32>
+// CHECK-SAME:    %[[ARGB:[0-9a-z]+]]: memref<32x64x4x32xf32>
+func.func @saxpy4d(%x: !type4d, %y: !type4d, %a: !type4d, %b: !type4d, %alpha : f32) -> !type4d {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %c4 = arith.constant 4 : index
+// CHECK:   %[[C32:.*]] = arith.constant 32 : index
+// CHECK:   %[[C64:.*]] = arith.constant 64 : index
+// CHECK:   %[[C4:.*]] = arith.constant 4 : index
+// CHECK:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK:   gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]])
+// CHECK:   %[[BLKX:.*]] = gpu.block_id x
+// CHECK:   %[[BLKY:.*]] = gpu.block_id y
+// CHECK:   %[[TIDX:.*]] = gpu.thread_id x
+// CHECK:   %[[TIDY:.*]] = gpu.thread_id y
+// CHECK:   memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+// CHECK:   memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+// CHECK:   gpu.barrier
+// CHECK:   memref.load %[[ARGA]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+// CHECK:   memref.load %[[ARGB]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
+  scf.foreach_thread (%i, %j) in (%c32, %c64) {
+    scf.foreach_thread (%k, %l) in (%c4, %c32) {
+      %4 = memref.load %x[%i, %j, %k, %l] : !type4d
+      %5 = memref.load %y[%i, %j, %k, %l] : !type4d
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %y[%i, %j, %k, %l] : !type4d
+    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+    scf.foreach_thread (%k, %l) in (%c4, %c32) {
+      %4 = memref.load %a[%i, %j, %k, %l] : !type4d
+      %5 = memref.load %b[%i, %j, %k, %l] : !type4d
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %b[%i, %j, %k, %l] : !type4d
+    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+  } { mapping = [#gpu.block<x>, #gpu.block<y>] }
+
+  return %y : !type4d
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["func.func"]} in %arg0
+  %gpuLaunch = transform.gpu.map_foreach %funcop { generate_gpu_launch, blockDim = [32, 4, 1] }
+}
\ No newline at end of file