diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -114,6 +114,20 @@
   let assemblyFormat = "attr-dict";
 }
 
+def GPU_LinearIdOp : GPU_Op<"linear_id", [
+    Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+  let description = [{
+    Returns the linearized thread id within the workgroup (block), i.e.
+    `tid.x + tid.y * dim.x + tid.z * dim.x * dim.y`.
+
+    Example:
+    ```mlir
+    %linearId = gpu.linear_id
+    ```
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
@@ -64,6 +64,22 @@
   }];
 }
 
+def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [
+    DimX, DimY, DimZ]> {
+  let cppNamespace = "::mlir::gpu";
+}
+
+def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
+    DeclareAttrInterfaceMethods<DeviceMappingAttrInterface>]> {
+  let parameters = (ins
+    EnumParameter<LinearIdEnum>:$linear_id
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute that allows defining thread parallelism for GPU devices in
+    terms of the linearized thread id within the workgroup, delinearized on
+    the basis supplied by the mapping transform (see `linear_dims`).
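+
+    For example, the following sketch mirrors the test added in this revision
+    (the guards and delinearized ids are produced by
+    `transform.gpu.map_nested_forall_to_threads`, not by the attribute
+    itself):
+
+    ```mlir
+    scf.forall (%i, %j) in (%c10, %c2) {
+      ...
+    } {mapping = [#gpu.linear<x>, #gpu.linear<y>]}
+    ```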
+  }];
+}
+
 def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
     DimX, DimY, DimZ]> {
   let cppNamespace = "::mlir::gpu";
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -114,6 +114,7 @@
   let arguments = (ins PDL_Operation:$target,
     DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
     DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
+    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$linear_dims,
     DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute);
   let results = (outs PDL_Operation:$result);
 
@@ -121,6 +122,7 @@
     $target
     `block_dims` `=` $block_dims
     (`warp_dims` `=` $warp_dims^)?
+    (`linear_dims` `=` $linear_dims^)?
     (`sync_after_distribute` `=` $sync_after_distribute^)?
     attr-dict
   }];
@@ -132,7 +134,6 @@
   }];
 }
 
-
 def MapForallToBlocks :
     Op<Transform_Dialect, "gpu.map_forall_to_blocks", [
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ ... @@
 int64_t GPUWarpMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getWarp());
 }
 
+int64_t GPULinearIdMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getLinearId());
+}
+
 int64_t GPUThreadMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getThread());
 }
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -118,6 +118,11 @@
   setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
 }
 
+void LinearIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
+                                   SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
+}
+
 void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
   setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -147,6 +147,45 @@
   }
 };
 
+struct GpuLinearIdBuilder : public GpuIdBuilder {
+  GpuLinearIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
+    mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
+                         GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
+                         GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      Location loc = forallOp.getLoc();
+      Value linearIdOp = rewriter.create<LinearIdOp>(loc);
+      SmallVector<int64_t> reverseBlockDims(llvm::reverse(mappingDims));
+      LLVM_DEBUG(llvm::interleaveComma(reverseBlockDims,
+                                       DBGS() << "--delinearization basis: ");
+                 llvm::dbgs() << "\n");
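+      // Worked example (numbers taken from the test in this revision, for
+      // illustration only): with mappingDims = [11, 3, 4], the reversed
+      // basis is [4, 3, 11] and computeStrides returns [33, 11, 1]; a linear
+      // id of 47 thus delinearizes to
+      //   (47 floordiv 33, (47 mod 33) floordiv 11, 47 mod 11) = (1, 1, 3).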
+      SmallVector<int64_t> strides = computeStrides(reverseBlockDims);
+      LLVM_DEBUG(llvm::interleaveComma(strides,
+                                       DBGS() << "--delinearization strides: ");
+                 llvm::dbgs() << "\n");
+
+      AffineExpr d0;
+      bindDims(rewriter.getContext(), d0);
+      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+      LLVM_DEBUG(llvm::interleaveComma(delinearizingExprs,
+                                       DBGS() << "--delinearization exprs: ");
+                 llvm::dbgs() << "\n");
+
+      SmallVector<Value> ids;
+      for (AffineExpr e : delinearizingExprs)
+        ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearIdOp));
+      LLVM_DEBUG(llvm::interleaveComma(ids, DBGS() << "--ids: ");
+                 llvm::dbgs() << "\n");
+
+      return ids;
+    };
+  }
+};
+
 } // namespace
 
 static DiagnosedSilenceableFailure
@@ -732,6 +771,34 @@
     rewriter.replaceAllUsesWith(subgroupIdOp, warpId.get());
   });
 
+  SmallVector<int64_t> linearDims{getLinearDims()};
+  if (!linearDims.empty()) {
+    if (linearDims.size() != 3)
+      return transformOp.emitDefiniteFailure("requires size-3 linear mapping");
+  }
+
+  LLVM_DEBUG(
+      llvm::interleaveComma(
+          linearDims, DBGS() << "mapNestedForallToThreadsImpl linearDims: ");
+      llvm::dbgs() << "\n");
+  GpuLinearIdBuilder gpuLinearIdBuilder(ctx);
+  diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
+      rewriter, transformOp, target, linearDims, getSyncAfterDistribute(),
+      gpuLinearIdBuilder);
+  if (!diag.succeeded())
+    return diag;
+
+  // Perform a late replacement of the LinearIdOp, taking into account the
+  // blockDims.
+  target->walk([&](LinearIdOp linearIdOp) {
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(linearIdOp);
+    auto linearThreadId =
+        getStaticLinearThreadId(rewriter, linearIdOp.getLoc(), blockDimsOfr);
+    LDBG("----linearThreadId: " << linearThreadId);
+    rewriter.replaceAllUsesWith(linearIdOp, linearThreadId.get());
+  });
+
   results.push_back(gpuLaunch.getOperation());
   return diag;
 }
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -239,16 +239,22 @@
 // CHECK-DAG: #[[MAPWY:.*]] = affine_map<(d0, d1, d2) -> (((d0 + d1 * 12 + d2 * 132) floordiv 32) floordiv 4)>
 // CHECK-DAG: #[[MAPWX:.*]] = affine_map<(d0, d1, d2) -> ((((d0 + d1 * 12 + d2 * 132) floordiv 32) mod 4) floordiv 2)>
 
+// CHECK-DAG: #[[MAPLX:.*]] = affine_map<(d0, d1, d2) -> (d2 * 4 + (d0 + d1 * 12) floordiv 33)>
+// CHECK-DAG: #[[MAPLY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 33) floordiv 11)>
+// CHECK-DAG: #[[MAPLZ:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) mod 11)>
+
 // CHECK-LABEL: func.func @map_multi_level(
 func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
   %one = arith.constant 1 : index
-  %c12 = arith.constant 12 : index
+  %c10 = arith.constant 10 : index
   %c9 = arith.constant 9 : index
   %c7 = arith.constant 7 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
 
   // check that both the thread level and the warp level got distributed.
   // CHECK-NOT: #gpu.thread
@@ -277,6 +283,21 @@
       %8 = arith.addf %alpha, %7 : f32
       memref.store %8, %t[%i] : !type1d
     }  {mapping = [#gpu.warp<x>] }
+
+    // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[MAPLX]](%[[TIDX]], %[[TIDY]], %[[TIDZ]])
+    // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[MAPLY]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[MAPLZ]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[LIDX]], %[[C10]] : index
+    // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[LIDY]], %[[C2]] : index
+    // CHECK: %[[CONDXY:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
+    // CHECK-DAG: %[[CMPZ:.*]] = arith.cmpi ult, %[[LIDZ]], %[[C1]] : index
+    // CHECK: %[[CONDXYZ:.*]] = arith.andi %[[CONDXY]], %[[CMPZ]] : i1
+    // CHECK: scf.if %[[CONDXYZ]]
+    scf.forall (%i, %j) in (%c10, %c2) {
+      %7 = memref.load %t[%i] : !type1d
+      %8 = arith.addf %alpha, %7 : f32
+      memref.store %8, %t[%j] : !type1d
+    } {mapping = [#gpu.linear<x>, #gpu.linear<y>] }
     gpu.terminator
   }
   return %y : !type
@@ -286,5 +307,5 @@
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   transform.gpu.map_nested_forall_to_threads %funcop
-    block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
+    block_dims = [12, 11, 1] warp_dims = [2, 2, 1] linear_dims = [11, 3, 4]
 }