diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -33,17 +33,32 @@
 namespace transform {
 namespace gpu {
 
+constexpr int64_t kWarpSize = 32;
+
+/// Helper type for functions that generate ids for the mapping of a
+/// scf.forall.
+using GpuIdBuilderFnType = llvm::function_ref<SmallVector<Value>(
+    RewriterBase &, scf::ForallOp, ArrayRef<int64_t> mappingDims)>;
+
+/// Helper struct for passing the mapping attributes and id generator to the
+/// common forall rewriter.
+struct GpuIdBuilder {
+  /// The mapping attributes targeted by this generator.
+  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
+  /// The constructor that builds the concrete IR for mapping ids.
+  GpuIdBuilderFnType idBuilder;
+};
+
 /// Map the top level `scf.forall` op to GPU Thread Blocks.
 /// Mapping is one-to-one and the induction variables of `scf.forall` are
-/// rewritten to gpu.block_id according to the thread_dim_apping attribute.
+/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
 /// Dynamic, `scf.forall` trip counts are currently not supported.
 /// Dynamic block dim sizes are currently not supported.
-DiagnosedSilenceableFailure mapForallToBlocksImpl(
-    RewriterBase &rewriter, TransformOpInterface transformOp,
-    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        blockIdGenerator);
+DiagnosedSilenceableFailure
+mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
+                      scf::ForallOp forallOp,
+                      SmallVectorImpl<int64_t> &gridDims,
+                      const GpuIdBuilder &gpuIdBuilder);
 
 /// Search `scf.forall` ops nested under `target` and map each such op to GPU
 /// threads. Mapping is one-to-one and the induction variables of `scf.forall`
@@ -56,10 +71,7 @@
 DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
     Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        threadIdGenerator);
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);
 
 /// Find the unique top level scf::ForallOp within a given target op.
 DiagnosedSilenceableFailure
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -8,7 +8,9 @@
 
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -16,8 +18,10 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
@@ -27,6 +31,7 @@
 using namespace mlir;
 using namespace mlir::gpu;
 using namespace mlir::transform;
+using namespace mlir::transform::gpu;
 
 #define DEBUG_TYPE "gpu-transforms"
 
@@ -35,58 +40,87 @@
 namespace {
 
-/// Helper type for functions that generate ids for the mapping of a scf.forall.
-using IdGeneratorFnType = llvm::function_ref<void(
-    RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>;
-
-struct MappingToGpuHelper {
-  MappingToGpuHelper(SmallVector<DeviceMappingAttrInterface> mappingAttributes,
-                     IdGeneratorFnType idGenerator)
-      : mappingAttributes(mappingAttributes), idGenerator(idGenerator) {}
+/// Return a flattened thread id for the workgroup with given sizes.
+static OpFoldResult getLinearThreadId(RewriterBase &rewriter, Location loc) {
+  AffineExpr tx, ty, tz, BDX, BDY;
+  bindDims(rewriter.getContext(), tx, ty, tz);
+  bindSymbols(rewriter.getContext(), BDX, BDY);
+  IndexType indexType = rewriter.getIndexType();
+  SmallVector<OpFoldResult> threadsAndWorkGroups{
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
+  threadsAndWorkGroups.push_back(
+      rewriter.create<BlockDimOp>(loc, indexType, Dimension::x).getResult());
+  threadsAndWorkGroups.push_back(
+      rewriter.create<BlockDimOp>(loc, indexType, Dimension::y).getResult());
+  return makeComposedFoldedAffineApply(
+      rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
+}
 
-  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
-  IdGeneratorFnType idGenerator;
+struct GpuBlockIdBuilder : public GpuIdBuilder {
+
+  GpuBlockIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
+    mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
+                         GPUBlockMappingAttr::get(ctx, Blocks::DimY),
+                         GPUBlockMappingAttr::get(ctx, Blocks::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      IndexType indexType = rewriter.getIndexType();
+      auto loc = forallOp->getLoc();
+      return SmallVector<Value>{
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
+    };
+  }
 };
 
-struct MappingToGpuBlocksHelper : public MappingToGpuHelper {
-
-  MappingToGpuBlocksHelper(MLIRContext *ctx)
-      : MappingToGpuHelper(
-            SmallVector<DeviceMappingAttrInterface>{
-                GPUBlockMappingAttr::get(ctx, Blocks::DimX),
-                GPUBlockMappingAttr::get(ctx, Blocks::DimY),
-                GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
-            IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,
-                                 SmallVectorImpl<Value> &ids) {
-              OpBuilder::InsertionGuard guard(rewriter);
-              rewriter.setInsertionPoint(forallOp);
-              IndexType indexType = rewriter.getIndexType();
-              auto loc = forallOp->getLoc();
-              ids.assign(
-                  {rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
-                   rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
-                   rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)});
-            }}) {}
+struct GpuThreadIdBuilder : public GpuIdBuilder {
+  GpuThreadIdBuilder(MLIRContext *ctx, SmallVector<Value> *idCaptures = nullptr)
+      : GpuIdBuilder() {
+    mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
+                         GPUThreadMappingAttr::get(ctx, Threads::DimY),
+                         GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      IndexType indexType = rewriter.getIndexType();
+      auto loc = forallOp->getLoc();
+      return SmallVector<Value>{
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+    };
+  }
 };
 
-struct MappingToGpuThreadsHelper : public MappingToGpuHelper {
-  MappingToGpuThreadsHelper(MLIRContext *ctx)
-      : MappingToGpuHelper(
-            SmallVector<DeviceMappingAttrInterface>{
-                GPUThreadMappingAttr::get(ctx, Threads::DimX),
-                GPUThreadMappingAttr::get(ctx, Threads::DimY),
-                GPUThreadMappingAttr::get(ctx, Threads::DimZ)},
-            IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,
-                                 SmallVectorImpl<Value> &ids) {
-              OpBuilder::InsertionGuard guard(rewriter);
-              rewriter.setInsertionPoint(forallOp);
-              IndexType indexType = rewriter.getIndexType();
-              auto loc = forallOp->getLoc();
-              ids.assign(
-                  {rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
-                   rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
-                   rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)});
-            }}) {}
+struct GpuWarpIdBuilder : public GpuIdBuilder {
+  GpuWarpIdBuilder(MLIRContext *ctx, SmallVector<Value> *idCaptures = nullptr)
+      : GpuIdBuilder() {
+    mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
+                         GPUWarpMappingAttr::get(ctx, Warps::DimY),
+                         GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      Location loc = forallOp.getLoc();
+      // NOTE(review): the created op's template argument was lost in transit;
+      // gpu.subgroup_id (warp == subgroup) is the closest match — TODO confirm
+      // against the original revision.
+      Value warpId = rewriter.create<SubgroupIdOp>(loc);
+      SmallVector<int64_t> reverseBlockDims(llvm::reverse(mappingDims));
+      SmallVector<int64_t> strides = computeStrides(reverseBlockDims);
+      AffineExpr d0;
+      bindDims(rewriter.getContext(), d0);
+      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+      SmallVector<Value> ids;
+      for (AffineExpr e : delinearizingExprs)
+        ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
+      return ids;
+    };
+  }
 };
 
 } // namespace
 
@@ -95,7 +129,7 @@
 failureHelper(std::optional<TransformOpInterface> transformOp,
               scf::ForallOp forallOp, const Twine &message) {
   if (transformOp.has_value())
-    return emitDefiniteFailure(*transformOp, message);
+    return transformOp->emitSilenceableError() << message;
   return emitDefiniteFailure(forallOp, message);
 }
 
@@ -114,9 +148,14 @@
       llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
         return attr.isa<GPUThreadMappingAttr>();
       });
+  bool hasWarpMapping =
+      llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
+        return attr.isa<GPUWarpMappingAttr>();
+      });
   int64_t countMappingTypes = 0;
   countMappingTypes += hasBlockMapping ? 1 : 0;
   countMappingTypes += hasThreadMapping ? 1 : 0;
+  countMappingTypes += hasWarpMapping ? 1 : 0;
   if (countMappingTypes > 1) {
     return failureHelper(transformOp, forallOp,
                         "cannot mix different mapping types, use nesting");
@@ -163,9 +202,9 @@
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Determines if the size of the kernel configuration is supported by the GPU
-/// architecture being used. It presently makes use of CUDA limitations, however
-/// that aspect may be enhanced for other GPUs.
+/// Determines if the size of the kernel configuration is supported by the
+/// GPU architecture being used. It presently makes use of CUDA limitations,
+/// however that aspect may be enhanced for other GPUs.
 static DiagnosedSilenceableFailure checkGpuLimits(
     TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
     std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
@@ -201,8 +240,8 @@
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and
-/// put a terminator within.
+/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
+/// and put a terminator within.
 static DiagnosedSilenceableFailure
 createGpuLaunch(RewriterBase &rewriter, Location loc,
                 TransformOpInterface transformOp, LaunchOp &launchOp,
@@ -278,21 +317,41 @@
   return DiagnosedSilenceableFailure::success();
 }
 
+/// Struct to return the result of the rewrite of a forall operation.
+struct ForallRewriteResult {
+  SmallVector<int64_t> mappingSizes;
+  SmallVector<Value> mappingIds;
+};
+
+/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
+static void
+replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
+                            ValueRange mappingIds,
+                            ArrayRef<int64_t> availableMappingSizes) {
+  assert(!mappingIds.empty() && "expected some mapping");
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(mappingIds.front().getDefiningOp());
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  for (auto [dim, id] : llvm::zip_equal(availableMappingSizes, mappingIds)) {
+    if (dim == 1)
+      rewriter.replaceAllUsesWith(id, zero);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // MapForallToBlocks
 //===----------------------------------------------------------------------===//
 
-static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(
+static FailureOr<ForallRewriteResult> rewriteOneForallCommonImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
     scf::ForallOp forallOp,
    const SmallVectorImpl<int64_t> &availableMappingSizes,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    const GpuIdBuilder &gpuIdBuilder) {
   LDBG("Start rewriteOneForallCommonImpl");
 
   // Step 0. GPU-specific verifications. There is no better place to anchor
-  // those right now: the ForallOp is target-independent and the transform op
-  // does not apply to individual ForallOp.
+  // those right now: the ForallOp is target-independent and the transform
+  // op does not apply to individual ForallOp.
   DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
   if (!diag.succeeded())
     return failure();
@@ -306,14 +365,15 @@
   }));
   SmallVector<Attribute> forallMappings =
       llvm::to_vector(forallOp.getMapping()->getValue());
-  for (auto attr : allMappingAttributes) {
+  for (auto attr : gpuIdBuilder.mappingAttributes) {
     if (llvm::is_contained(forallMappings, attr))
       continue;
     forallMappings.push_back(attr);
     tmpMappingSizes.push_back(1);
   }
 
-  // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
+  // Step 2. sort the values by the corresponding
+  // DeviceMappingAttrInterface.
   auto comparator = [&](DeviceMappingAttrInterface a,
                         DeviceMappingAttrInterface b) -> bool {
     return a.getMappingId() < b.getMappingId();
   };
@@ -325,22 +385,13 @@
              llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: ");
              llvm::dbgs() << "\n");
 
-  // Step 3. Generate the mappingIdOps using the provided generator and map the
-  // induction variables to the newly created ops. Replace ids of dimension
-  // known to be of size 1 by zero to simplify the IR.
-  SmallVector<Value> mappingIdOps;
-  Location loc = forallOp.getLoc();
-  idGenerator(rewriter, forallOp, mappingIdOps);
+  // Step 3. Generate the mappingIdOps using the provided generator and map
+  // the induction variables to the newly created ops.
+  SmallVector<Value> mappingIdOps =
+      gpuIdBuilder.idBuilder(rewriter, forallOp, mappingSizes);
   LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: ");
              llvm::dbgs() << "\n");
   assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes");
-  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-  if (!availableMappingSizes.empty()) {
-    for (size_t i : llvm::seq(size_t(0), availableMappingSizes.size())) {
-      if (availableMappingSizes[i] == 1)
-        mappingIdOps[i] = zero;
-    }
-  }
 
   IRMapping bvm;
   for (auto [iv, dim] :
@@ -354,6 +405,7 @@
 
   // Step 4. Maybe create conditionals to predicate the region.
   // Skip this step when availableMappingSizes is empty.
+  Location loc = forallOp.getLoc();
   Value predicate;
   if (!availableMappingSizes.empty()) {
     LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes,
@@ -389,12 +441,13 @@
   Block::iterator insertionPoint;
   if (predicate) {
     // Step 5.a. If predicated, move at the beginning.
-    auto ifOp =
-        rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
+    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
+                                           /*withElseRegion=*/false);
     targetBlock = ifOp.thenBlock();
     insertionPoint = ifOp.thenBlock()->begin();
   } else {
-    // Step 5.b. Otherwise, move inline just at the rewriter insertion point.
+    // Step 5.b. Otherwise, move inline just at the rewriter insertion
+    // point.
     targetBlock = forallOp->getBlock();
     insertionPoint = rewriter.getInsertionPoint();
   }
@@ -411,23 +464,30 @@
 
   // Step 7. Erase old op.
   rewriter.eraseOp(forallOp);
 
-  return mappingSizes;
+  return ForallRewriteResult{mappingSizes, mappingIdOps};
 }
 
 DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
     RewriterBase &rewriter, TransformOpInterface transformOp,
     scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    const GpuIdBuilder &gpuIdBuilder) {
+  // Pass an empty anyAvailableMappingSizes.
+  Location loc = forallOp.getLoc();
   SmallVector<int64_t> anyAvailableMappingSizes;
-  FailureOr<SmallVector<int64_t>> maybeMappingSizes =
-      rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
-                                 anyAvailableMappingSizes, allMappingAttributes,
-                                 idGenerator);
-  if (failed(maybeMappingSizes))
+  FailureOr<ForallRewriteResult> rewriteResult = rewriteOneForallCommonImpl(
+      rewriter, transformOp, forallOp, anyAvailableMappingSizes, gpuIdBuilder);
+
+  // Fail if anything goes wrong.
+  if (failed(rewriteResult))
     return DiagnosedSilenceableFailure::definiteFailure();
-  gridDims = *maybeMappingSizes;
+  gridDims = rewriteResult->mappingSizes;
+
+  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
+  // Here, the result of mapping determines the available mapping sizes.
+  replaceUnitMappingIdsHelper(rewriter, loc, rewriteResult->mappingIds,
+                              gridDims);
+
   return DiagnosedSilenceableFailure::success();
 }
 
@@ -500,10 +560,9 @@
   if (!diag.succeeded())
     return diag;
 
-  MappingToGpuBlocksHelper helper(getContext());
+  GpuBlockIdBuilder gpuBlockIdBuilder(getContext());
   diag = mlir::transform::gpu::mapForallToBlocksImpl(
-      rewriter, transformOp, topLevelForallOp, gridDims,
-      helper.mappingAttributes, helper.idGenerator);
+      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
   if (!diag.succeeded())
     return diag;
 
@@ -522,30 +581,36 @@
 DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
     Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) {
   DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
   target->walk([&](scf::ForallOp forallOp) {
-    // Ignore cases with different attributes.
+    // Ignore cases with different attributes than this builder supports.
     for (Attribute map : forallOp.getMapping()->getValue()) {
-      if (!llvm::is_contained(allMappingAttributes, map)) {
+      if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) {
         return WalkResult::skip();
       }
     }
     diag = verifyGpuMapping(transformOp, forallOp);
     if (diag.succeeded()) {
-      // Take the loc ahead of time
       Location loc = forallOp.getLoc();
       OpBuilder::InsertionGuard g(rewriter);
+      // Insert after to allow for syncthreads after `forall` is erased.
       rewriter.setInsertionPointAfter(forallOp);
-      if (failed(rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
-                                            kernelBlockDims,
-                                            allMappingAttributes, idGenerator)))
+      FailureOr<ForallRewriteResult> rewriteResult = rewriteOneForallCommonImpl(
+          rewriter, transformOp, forallOp, kernelBlockDims, gpuIdBuilder);
+
+      // Fail if anything goes wrong.
+      if (failed(rewriteResult))
         diag = DiagnosedSilenceableFailure::definiteFailure();
+      // Add a syncthreads if needed. TODO: warpsync
       if (syncAfterDistribute)
         rewriter.create<BarrierOp>(loc);
+
+      // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
+      // Here, the available mapping sizes are given by `kernelBlockDims`.
+      // Guard the dereference: `rewriteResult` may hold failure() above.
+      if (succeeded(rewriteResult))
+        replaceUnitMappingIdsHelper(rewriter, loc, rewriteResult->mappingIds,
+                                    kernelBlockDims);
     }
     return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
   });
@@ -575,10 +640,11 @@
 
   MLIRContext *ctx = getContext();
   IRRewriter rewriter(ctx);
-  MappingToGpuThreadsHelper helper(ctx);
+  SmallVector<Value> idCaptures;
+  GpuThreadIdBuilder gpuThreadIdBuilder(ctx, &idCaptures);
   diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
       rewriter, transformOp, target, blockDims, getSyncAfterDistribute(),
-      helper.mappingAttributes, helper.idGenerator);
+      gpuThreadIdBuilder);
   if (!diag.succeeded())
     return diag;