diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h --- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h @@ -33,40 +33,37 @@ namespace transform { namespace gpu { -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// Searches `scf.forall` ops nested under `target` and maps each such /// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in +/// `scf.forall` are rewritten to gpu.thread_id according to the +/// thread_dim_apping attribute. Sibling `scf.forall` are supported in /// which case, the union of the number of threads is computed and may result in -/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not +/// predication. Dynamic, `scf.forall` trip counts are currently not /// supported. Dynamic block dim sizes are currently not supported. DiagnosedSilenceableFailure mapNestedForeachToThreadsImpl( RewriterBase &rewriter, Operation *target, const SmallVectorImpl &blockDim, - function_ref &)> + function_ref &)> threadIdGenerator, bool syncAfterDistribute, std::optional transformOp, const ArrayRef &threadMappingAttributes); -/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is -/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten +/// Maps the top level `scf.forall` op to GPU Thread Blocks. Mapping is +/// one-to-one and the induction variables of `scf.forall` are rewritten /// to gpu.block_id according to the thread_dim_apping attribute. Dynamic, -/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block +/// `scf.forall` trip counts are currently not supported. Dynamic block /// dim sizes are currently not supported. DiagnosedSilenceableFailure mapForeachToBlocksImpl( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> + RewriterBase &rewriter, scf::ForallOp forallOp, + function_ref &)> blockIdGenerator, SmallVectorImpl &gridDims, TransformOpInterface transformOp, const ArrayRef &mappingAttributes); -/// Finds the top level scf::ForeachThreadOp of given target. +/// Finds the top level scf::ForallOp of given target. DiagnosedSilenceableFailure -findTopLevelForeachThreadOp(Operation *target, - scf::ForeachThreadOp &topLevelForeachThreadOp, - TransformOpInterface transformOp); +findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp, + TransformOpInterface transformOp); } // namespace gpu } // namespace transform diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td --- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td +++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td @@ -22,30 +22,30 @@ TransformEachOpTrait, TransformOpInterface]> { let description = [{ - Target the `gpu.launch op` and rewrite all `scf.foreach_thread` + Target the `gpu.launch op` and rewrite all `scf.forall` nested in it to distributed `gpu.thread_id` attribute. - The operation searches for `scf.foreach_thread` ops nested under `target` + The operation searches for `scf.forall` ops nested under `target` and maps each such op to GPU threads. 
Mapping is one-to-one and the - induction variables of `scf.foreach_thread` are rewritten to + induction variables of `scf.forall` are rewritten to `gpu.thread_id` according to the `mapping` attribute. - Sibling `scf.foreach_thread` are supported in which case, the union of + Sibling `scf.forall` are supported in which case, the union of the number of threads is computed and may result in predication. - Multiple scf.foreach_thread are supported per `gpu.launch` in which case, + Multiple scf.forall are supported per `gpu.launch` in which case, the max of all the threads is computed and taken for the global - `gpu.thread_id`. If necessary, `scf.foreach_thread` that do not use the + `gpu.thread_id`. If necessary, `scf.forall` that do not use the whole thread range result in predicated computations. - Dynamic `scf.foreach_thread` trip counts are currently not supported. + Dynamic `scf.forall` trip counts are currently not supported. Dynamic block dim sizes are currently not supported. - Only **bufferized** `scf.foreach_thread` are currently supported. - Only `scf.foreach_thread` distributed to **at most 3 dimensions** are + Only **bufferized** `scf.forall` are currently supported. + Only `scf.forall` distributed to **at most 3 dimensions** are currently supported. - Barriers are inserted after each scf.foreach_thread op for now. + Barriers are inserted after each scf.forall op for now. The operation alters the block size of the given gpu_launch using blockDim argument. @@ -54,15 +54,15 @@ This operation ignores non-gpu_launch ops and drops them in the return. - If any scf.foreach_thread with tensors is found, the transform definitely + If any scf.forall with tensors is found, the transform definitely fails. - If all the scf.foreach_thread operations with gpu.thread mapping contained + If all the scf.forall operations with gpu.thread mapping contained within the LaunchOp referred to by the `target` PDLOperation lower to GPU properly, the transform succeeds. Otherwise the transform definitely fails. - scf.foreach_thread operations with mappings other than gpu.thread are + scf.forall operations with mappings other than gpu.thread are ignored. The returned handle points to the same LaunchOp operand, consuming it and @@ -74,10 +74,10 @@ ``` gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2) threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) { - scf.foreach_thread (%i, %j) in (7, 9) { + scf.forall (%i, %j) in (7, 9) { ... // body 1 } {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (12) { + scf.forall (%i) in (12) { ... // body 2 } {mapping = [#gpu.thread]} gpu.terminator @@ -125,21 +125,21 @@ TransformOpInterface, TransformEachOpTrait]> { let description = [{ - Target the gpu_launch op and rewrite the top level `scf.foreach_thread` + Target the gpu_launch op and rewrite the top level `scf.forall` to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute is set, then first generates `gpu_launch` and moves the top level - `scf.foreach_thread` inside. + `scf.forall` inside. - The operation searches top level `scf.foreach_thread` ops under + The operation searches top level `scf.forall` ops under `gpu_launch` and maps each such op to GPU blocks. Mapping is - one-to-one and the induction variables of `scf.foreach_thread` are + one-to-one and the induction variables of `scf.forall` are rewritten to gpu.block_id according to the `thread_dim_mapping` attribute. - Dynamic, `scf.foreach_thread` trip counts are currently not supported. 
+ Dynamic, `scf.forall` trip counts are currently not supported. Dynamic block dim sizes are currently not supported. - Only **bufferized** scf.foreach_thread are currently supported. - Only scf.foreach_thread distributed to **at most 3 dimensions** are + Only **bufferized** scf.forall are currently supported. + Only scf.forall distributed to **at most 3 dimensions** are currently supported. The operation alters the block size of the given gpu_launch using @@ -149,10 +149,10 @@ This operation ignores non-gpu_launch ops and drops them in the return. - If any scf.foreach_thread with tensors is found, the transform definitely + If any scf.forall with tensors is found, the transform definitely fails. - If all the scf.foreach_thread operations contained within the LaunchOp + If all the scf.forall operations contained within the LaunchOp referred to by the `target` PDLOperation lower to GPU properly, the transform succeeds. Otherwise the transform definitely fails. diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h @@ -78,8 +78,8 @@ /// Return true if `linalgOp` contains an embedded gemm subcomputation. bool containsMostMinorGemm(linalg::LinalgOp linalgOp); -/// Implementation of tiling operations using `scf.foreach_thread`. -DiagnosedSilenceableFailure tileToForeachThreadOpImpl( +/// Implementation of tiling operations using `scf.forall`. +DiagnosedSilenceableFailure tileToForallOpImpl( RewriterBase &rewriter, transform::TransformState &state, TransformOpInterface transformOp, ArrayRef targets, ArrayRef mixedNumThreads, diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -1197,7 +1197,7 @@ ``` }]; - // TODO: support mixed static-dynamic (see TileToForeachThreadOp). + // TODO: support mixed static-dynamic (see TileToForallOp). let arguments = (ins PDL_Operation:$target, DefaultValuedAttr:$tile_sizes); let results = (outs PDL_Operation:$for_op, @@ -1225,31 +1225,31 @@ } //===----------------------------------------------------------------------===// -// TileReductionUsingForeachThreadOp +// TileReductionUsingForallOp //===----------------------------------------------------------------------===// -def TileReductionUsingForeachThreadOp : - Op { let description = [{ - Tile a PartialReductionOpInterface op to a tiled `scf.foreach_thread` doing + Tile a PartialReductionOpInterface op to a tiled `scf.forall` doing partial reduction. This transformation tiles the `target` along the reduction dimensions. It creates a tensor initialized with the identity value. Then it creates a - `scf.foreach_thread` loops with the number threads given by `num_threads`. + `scf.forall` loops with the number threads given by `num_threads`. The op is tiled op with a size equal to `floordiv(size, num_threads)`. All the partial reduction value is are parallel inserted to create a new tensor. After the loop a merge operation is created to do a final reduction with the partial reductions tensor. If an extra `tile_sizes` parameter is passed the tiles are cyclically - distributed on the threads of the `scf.foreach_threads` loop. 
+ distributed on the threads of the `scf.foralls` loop. #### Return modes This 4 returned handles point to: - - the parent foreach_thread op, + - the parent forall op, - the fill op used to initialize the neutral element, - the parallel tiled op and - the result-combining op. @@ -1274,7 +1274,7 @@ ``` %0 = tensor.empty(%dim_1) : tensor %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor - %2 = scf.foreach_thread (%arg2) in (%c5) shared_outs(%arg3 = %1) -> (tensor) { + %2 = scf.forall (%arg2) in (%c5) shared_outs(%arg3 = %1) -> (tensor) { %4 = affine.min #map(%arg2)[%dim_0] %5 = affine.max #map1(%4) %extracted_slice = tensor.extract_slice %arg3[0, %arg2] [%dim, 1] [1, 1] : tensor to tensor @@ -1286,7 +1286,7 @@ %9 = arith.addf %in, %out : f32 linalg.yield %9 : f32 } -> tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %7 into %arg3[0, %arg2] [%dim, 1] [1, 1] : tensor into tensor } } {mapping = []} @@ -1298,12 +1298,12 @@ ``` }]; - // TODO: support mixed static-dynamic (see TileToForeachThreadOp). + // TODO: support mixed static-dynamic (see TileToForallOp). let arguments = (ins PDL_Operation:$target, DefaultValuedAttr:$num_threads, DefaultValuedAttr:$tile_sizes, OptionalAttr:$mapping); - let results = (outs PDL_Operation:$foreach_thread_op, + let results = (outs PDL_Operation:$forall_op, PDL_Operation:$fill_op, PDL_Operation:$split_linalg_op, PDL_Operation:$combining_linalg_op); @@ -1412,16 +1412,16 @@ } //===----------------------------------------------------------------------===// -// TileToForeachThreadOp +// TileToForallOp //===----------------------------------------------------------------------===// -def TileToForeachThreadOp : - Op, TransformOpInterface]> { let description = [{ - Tile a TilingInterface op to a tiled `scf.foreach_thread`. + Tile a TilingInterface op to a tiled `scf.forall`. Tiling is applied by either specifying `num_threads` or `tile_size`. If `num_threads` is specified, then the tile size for each dimension `i` is @@ -1438,7 +1438,7 @@ e.g. in the Linalg case). If non-empty, the `mapping` is added as an attribute to the - resulting `scf.foreach_thread`. + resulting `scf.forall`. Note: `tile_sizes` and `num_threads` are variadic. Each tile size/number of threads can be an index attribute or a transform handle that is mapped to @@ -1457,14 +1457,14 @@ tiled operations, which can all be empty. These two returned handles point to: - - the new scf.foreach_thread op, + - the new scf.forall op, - the tiled op that implements TilingInterface. 
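    Before the transform-dialect examples below, here is a minimal C++ sketch of driving the
    renamed entry point `linalg::tileToForallOp` (declared in Linalg/Transforms/Transforms.h in
    this patch) directly. It is an illustration only: the element types (`OpFoldResult` thread
    counts, optional `ArrayAttr` mapping) are assumed to match the pre-rename signature, and the
    wrapper name `tileWithForall` is hypothetical.

    ```
    // Illustrative sketch only (not part of the patch): calling the renamed
    // linalg::tileToForallOp from a rewrite, assuming the pre-rename signature
    // (RewriterBase &, TilingInterface, ArrayRef<OpFoldResult>,
    // std::optional<ArrayAttr>).
    #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
    using namespace mlir;

    static LogicalResult tileWithForall(RewriterBase &rewriter, TilingInterface op) {
      // Distribute the two outermost parallel dimensions over a 10x20 grid.
      SmallVector<OpFoldResult> numThreads = {rewriter.getIndexAttr(10),
                                              rewriter.getIndexAttr(20)};
      rewriter.setInsertionPoint(op);
      FailureOr<linalg::ForallTilingResult> tiled =
          linalg::tileToForallOp(rewriter, op, numThreads, /*mapping=*/std::nullopt);
      if (failed(tiled))
        return failure();
      // tiled->tileOp is the new scf.forall; tiled->tiledOp is the tiled payload op.
      return success();
    }
    ```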
#### Example using `num_threads` ``` %0 = pdl_match @match_matmul in %arg1 - %3:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [10, 20] + %3:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] ``` #### Example using `tile_sizes` @@ -1472,7 +1472,7 @@ ``` %0 = pdl_match @match_matmul in %arg1 %sz = pdl_match @match_size_op in %arg1 - %3:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [0, %sz, 20] + %3:2 = transform.structured.tile_to_forall_op %0 tile_sizes [0, %sz, 20] ``` }]; @@ -1484,7 +1484,7 @@ DefaultValuedOptionalAttr:$static_num_threads, DefaultValuedOptionalAttr:$static_tile_sizes, OptionalAttr:$mapping); - let results = (outs PDL_Operation:$foreach_thread_op, + let results = (outs PDL_Operation:$forall_op, PDL_Operation:$tiled_op); let builders = [ diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -507,40 +507,40 @@ computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, int64_t divisor); -/// Rewrite a TilingInterface `op` to a tiled `scf.foreach_thread`, applying +/// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying /// tiling by `numThreads`. /// If non-empty, the `mapping` is added as an attribute to the -/// resulting `scf.foreach_thread`. +/// resulting `scf.forall`. /// Zero tile sizes indicate that the dimension is not tiled, and can be /// thought of as tiling by the full size of data. It is the user's /// responsibility to ensure that `numThreads` is a valid tiling specification /// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case). -struct ForeachThreadTilingResult { +struct ForallTilingResult { Operation *tileOp; Operation *tiledOp; }; -FailureOr -tileToForeachThreadOp(RewriterBase &builder, TilingInterface op, - ArrayRef numThreads, - std::optional mapping); +FailureOr tileToForallOp(RewriterBase &builder, + TilingInterface op, + ArrayRef numThreads, + std::optional mapping); -/// Same as `tileToForeachThreadOp`, but calculate the number of threads +/// Same as `tileToForallOp`, but calculate the number of threads /// required using the given tileSizes. -FailureOr -tileToForeachThreadOpUsingTileSizes(RewriterBase &builder, TilingInterface op, - ArrayRef tileSizes, - std::optional mapping); +FailureOr +tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op, + ArrayRef tileSizes, + std::optional mapping); /// Transformation information returned after reduction tiling. -struct ForeachThreadReductionTilingResult { +struct ForallReductionTilingResult { /// The partial reduction tiled op generated. Operation *parallelTiledOp; /// The final reduction operation merging all the partial reductions. Operation *mergeOp; /// The op initializing the tensor used for partial reductions. Operation *initialOp; - /// The `scf.foreach_thread` operation that iterate over the tiles. - scf::ForeachThreadOp loops; + /// The `scf.forall` operation that iterate over the tiles. + scf::ForallOp loops; }; /// Method to tile a reduction to parallel iterations computing partial @@ -556,7 +556,7 @@ /// /// ```mlir /// %0 = linalg.fill ... 
: tensor<7x4xf32> -/// %1 = scf.foreach_thread (%iv) in (%c4) shared_outs(%arg0 = %0) +/// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0) /// -> (tensor<7x4xf32>) { /// %2 = tensor.extract_slice %arg3 : tensor<7x4xf32> to tensor<7xf32> /// %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32> @@ -567,10 +567,11 @@ /// %6 = linalg.generic %1 ["parallel", "reduction"] /// : tensor<7x4xf32> -> tensor<7xf32> /// ``` -FailureOr tileReductionUsingForeachThread( - RewriterBase &b, PartialReductionOpInterface op, - ArrayRef numThreads, ArrayRef tileSizes = {}, - std::optional mapping = std::nullopt); +FailureOr +tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, + ArrayRef numThreads, + ArrayRef tileSizes = {}, + std::optional mapping = std::nullopt); /// All indices returned by IndexOp should be invariant with respect to /// tiling. Therefore, if an operation is tiled, we have to transform the diff --git a/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td b/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td --- a/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td +++ b/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td @@ -30,7 +30,7 @@ can be used by the device-specific code generators and the desired regions can be connected to the given processing unit. - Currently, `scf.foreach_thread` uses this interface to express the mapping + Currently, `scf.forall` uses this interface to express the mapping of the loops it contains to the GPU's parallelism units such as threads and thread blocks. }]; diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCF.h b/mlir/include/mlir/Dialect/SCF/IR/SCF.h --- a/mlir/include/mlir/Dialect/SCF/IR/SCF.h +++ b/mlir/include/mlir/Dialect/SCF/IR/SCF.h @@ -52,9 +52,9 @@ /// value is not an induction variable, then return nullptr. ParallelOp getParallelForInductionVarOwner(Value val); -/// Returns the ForeachThreadOp parent of an thread index variable. +/// Returns the ForallOp parent of an thread index variable. /// If the provided value is not a thread index variable, then return nullptr. -ForeachThreadOp getForeachThreadOpThreadIndexOwner(Value val); +ForallOp getForallOpThreadIndexOwner(Value val); /// Return true if ops a and b (or their ancestors) are in mutually exclusive /// regions/blocks of an IfOp. diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -348,18 +348,18 @@ } //===----------------------------------------------------------------------===// -// ForeachThreadOp +// ForallOp //===----------------------------------------------------------------------===// -def ForeachThreadOp : SCF_Op<"foreach_thread", [ +def ForallOp : SCF_Op<"forall", [ AttrSizedOperandSegments, AutomaticAllocationScope, RecursiveMemoryEffects, - SingleBlockImplicitTerminator<"scf::PerformConcurrentlyOp">, + SingleBlockImplicitTerminator<"scf::InParallelOp">, ]> { let summary = "evaluate a block multiple times in parallel"; let description = [{ - `scf.foreach_thread` is a target-independent multi-dimensional parallel + `scf.forall` is a target-independent multi-dimensional parallel region application operation. It has exactly one block that represents the parallel body and it takes index operands that specify lower bounds, upper bounds and steps. 
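    As a small illustration of the renamed C++ helpers (see the SCF.h hunk above), the sketch
    below recovers the owning `scf.forall` of an induction variable and reads its mapping
    attribute. The wrapper function and its name are assumptions for this example.

    ```
    // Illustrative sketch only: using scf::getForallOpThreadIndexOwner, declared
    // in the SCF.h hunk above, to query the renamed op.
    #include "mlir/Dialect/SCF/IR/SCF.h"
    #include <optional>
    using namespace mlir;

    // Returns the mapping attribute of the scf.forall that owns `iv`, if `iv`
    // is one of its induction variables; std::nullopt otherwise.
    static std::optional<ArrayAttr> getOwnerMapping(Value iv) {
      if (scf::ForallOp forallOp = scf::getForallOpThreadIndexOwner(iv))
        return forallOp.getMapping();
      return std::nullopt;
    }
    ```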
@@ -389,22 +389,22 @@ the op is lowered to, or to ignore it when the specification is ill-formed or unsupported for a particular target. - The only allowed terminator is `scf.foreach_thread.perform_concurrently`. - `scf.foreach_thread` returns one value per `shared_out` operand. The - actions of the `perform_concurrently` terminators specify how to combine the + The only allowed terminator is `scf.forall.in_parallel`. + `scf.forall` returns one value per `shared_out` operand. The + actions of the `in_parallel` terminators specify how to combine the partial results of all parallel invocations into a full value, in some unspecified order. The "destination" of each such op must be a `shared_out` - block argument of the `scf.foreach_thread` op. + block argument of the `scf.forall` op. The actions involved in constructing the return values are further described by `tensor.parallel_insert_slice`. - `scf.foreach_thread` acts as an implicit synchronization point. + `scf.forall` acts as an implicit synchronization point. When the parallel function body has side effects, their order is unspecified across threads. - `scf.foreach_thread` can be printed in two different ways depending on + `scf.forall` can be printed in two different ways depending on whether the loop is normalized or not. The loop is 'normalized' when all lower bounds are equal to zero and steps are equal to one. In that case, `lowerBound` and `step` operands will be omitted during printing. @@ -415,7 +415,7 @@ // // Sequential context. // - %matmul_and_pointwise:2 = scf.foreach_thread (%thread_id_1, %thread_id_2) in + %matmul_and_pointwise:2 = scf.forall (%thread_id_1, %thread_id_2) in (%num_threads_1, %numthread_id_2) shared_outs(%o1 = %C, %o2 = %pointwise) -> (tensor, tensor) { // @@ -434,11 +434,11 @@ tensor to tensor %sE = add ins(%spointwise) outs(%sD) - scf.foreach_thread.perform_concurrently { - scf.foreach_thread.parallel_insert_slice %sD into %o1[h((%thread_id_1, %thread_id_2))]: + scf.forall.in_parallel { + scf.forall.parallel_insert_slice %sD into %o1[h((%thread_id_1, %thread_id_2))]: tensor into tensor - scf.foreach_thread.parallel_insert_slice %spointwise into %o2[i((%thread_id_1, %thread_id_2))]: + scf.forall.parallel_insert_slice %spointwise into %o2[i((%thread_id_1, %thread_id_2))]: tensor into tensor } } @@ -453,7 +453,7 @@ // // Sequential context. // - %pointwise = scf.foreach_thread (%i, %j) = (0, 0) to (%dim1, %dim2) + %pointwise = scf.forall (%i, %j) = (0, 0) to (%dim1, %dim2) step (%tileSize1, %tileSize2) shared_outs(%o1 = %out) -> (tensor, tensor) { // @@ -468,8 +468,8 @@ %add = map {"arith.addf"} ins(%sA, %sB) outs(%sC) - scf.foreach_thread.perform_concurrently { - scf.foreach_thread.parallel_insert_slice %add into + scf.forall.in_parallel { + scf.forall.parallel_insert_slice %add into %o[%i, %j][%tileSize1, %tileSize2][1, 1] : tensor into tensor } @@ -486,14 +486,14 @@ // Sequential context. Here `mapping` is expressed as GPU thread mapping // attributes // - %matmul_and_pointwise:2 = scf.foreach_thread (%thread_id_1, %thread_id_2) in + %matmul_and_pointwise:2 = scf.forall (%thread_id_1, %thread_id_2) in (%num_threads_1, %numthread_id_2) shared_outs(...) -> (tensor, tensor) { // // Parallel context, each thread with id = **(%thread_id_2, %thread_id_1)** // runs its version of the code. // - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { ... } } { mapping = [#gpu.thread, #gpu.thread] } @@ -507,9 +507,9 @@ ```mlir %t0 = ... %t1 = ... - %r = scf.foreach_thread ... 
shared_outs(%o = t0) -> tensor { + %r = scf.forall ... shared_outs(%o = t0) -> tensor { // %t0 and %t1 are privatized. %t0 is definitely copied for each thread - // because the scf.foreach_thread op's %t0 use bufferizes to a memory + // because the scf.forall op's %t0 use bufferizes to a memory // write. In the absence of other conflicts, %t1 is copied only if there // are uses of %t1 in the body that bufferize to a memory read and to a // memory write. @@ -661,28 +661,28 @@ static void ensureTerminator(Region & region, OpBuilder & builder, Location loc); - PerformConcurrentlyOp getTerminator(); + InParallelOp getTerminator(); }]; } //===----------------------------------------------------------------------===// -// PerformConcurrentlyOp +// InParallelOp //===----------------------------------------------------------------------===// -def PerformConcurrentlyOp : SCF_Op<"foreach_thread.perform_concurrently", [ +def InParallelOp : SCF_Op<"forall.in_parallel", [ Pure, Terminator, DeclareOpInterfaceMethods, - HasParent<"ForeachThreadOp">, + HasParent<"ForallOp">, ] # GraphRegionNoTerminator.traits> { - let summary = "terminates a `foreach_thread` block"; + let summary = "terminates a `forall` block"; let description = [{ - `scf.foreach_thread.perform_concurrently` is a designated terminator for - the `scf.foreach_thread` operation. + `scf.forall.in_parallel` is a designated terminator for + the `scf.forall` operation. It has a single region with a single block that contains a flat list of ops. Each such op participates in the aggregate formation of a single result of - the enclosing `scf.foreach_thread`. + the enclosing `scf.forall`. The result number corresponds to the position of the op in the terminator. }]; @@ -697,8 +697,8 @@ OpBuilder<(ins)>, ]; - // TODO: Add a `PerformConcurrentlyOpInterface` interface for ops that can - // appear inside perform_concurrently. + // TODO: Add a `InParallelOpInterface` interface for ops that can + // appear inside in_parallel. let extraClassDeclaration = [{ ::llvm::SmallVector<::mlir::BlockArgument> getDests(); ::llvm::iterator_range<::mlir::Block::iterator> getYieldingOps(); diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1325,7 +1325,7 @@ // ParallelInsertSliceOp //===----------------------------------------------------------------------===// -// TODO: Implement PerformConcurrentlyOpInterface. +// TODO: Implement InParallelOpInterface. def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [ AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface, @@ -1441,7 +1441,7 @@ /// `strides` operands. static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } - /// Return the OpResult of the enclosing ForeachThreadOp that is + /// Return the OpResult of the enclosing ForallOp that is /// corresponding to this ParallelInsertSliceOp. 
OpResult getTiedOpResult(); }]; diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp --- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -169,39 +169,38 @@ //===----------------------------------------------------------------------===// DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, - function_ref &)> + RewriterBase &rewriter, scf::ForallOp forallOp, + function_ref &)> blockIdGenerator, SmallVectorImpl &gridDims, TransformOpInterface transformOp, const ArrayRef &mappingAttributes) { // Step 0. Target-specific verifications. There is no good place to anchor - // those right now: the ForeachThreadOp is target-independent and the - // transform op does not apply to individual ForeachThreadOp. - Location loc = foreachThreadOp->getLoc(); + // those right now: the ForallOp is target-independent and the + // transform op does not apply to individual ForallOp. + Location loc = forallOp->getLoc(); - if (!foreachThreadOp.isNormalized()) + if (!forallOp.isNormalized()) return transformOp.emitSilenceableError() << "unsupported non-normalized loops"; - if (foreachThreadOp.getNumResults() > 0) + if (forallOp.getNumResults() > 0) return transformOp.emitSilenceableError() - << "only bufferized scf.foreach_thread lowers to " + << "only bufferized scf.forall lowers to " "gpu.block_id"; - if (foreachThreadOp.getRank() > 3) + if (forallOp.getRank() > 3) return transformOp.emitSilenceableError() - << "scf.foreach_thread with rank > 3 does not lower to " + << "scf.forall with rank > 3 does not lower to " "gpu.block_id"; - if (llvm::any_of(foreachThreadOp.getMixedUpperBound(), [](OpFoldResult ofr) { + if (llvm::any_of(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) { return !getConstantIntValue(ofr).has_value(); })) { return transformOp.emitSilenceableError() << "unsupported dynamic griddim size"; } SmallVector blockMapping = - llvm::to_vector(foreachThreadOp.getMapping()->getValue()); + llvm::to_vector(forallOp.getMapping()->getValue()); // Step 1. Complete the blockMapping to a full mapping (with 1s) if necessary. - SmallVector numBlocks = foreachThreadOp.getUpperBound(rewriter); + SmallVector numBlocks = forallOp.getUpperBound(rewriter); // Ensure we have 3 block sizes, one for each id. Value one; for (auto attr : mappingAttributes) { @@ -218,68 +217,68 @@ DeviceMappingAttrInterface b) -> bool { return a.getMappingId() < b.getMappingId(); }; - SmallVector gridDimValues = scf::ForeachThreadOp::getValuesSortedByKey( - blockMapping, numBlocks, comparator); + SmallVector gridDimValues = + scf::ForallOp::getValuesSortedByKey(blockMapping, numBlocks, comparator); for (Value v : gridDimValues) gridDims.push_back(v.getDefiningOp().value()); // Step 3. Generate the blockIds using the provided generator and map the // induction variables to the newly created ops. SmallVector blockOps; - blockIdGenerator(rewriter, foreachThreadOp, blockOps); + blockIdGenerator(rewriter, forallOp, blockOps); IRMapping bvm; for (auto [blockIdx, blockDim] : - llvm::zip(foreachThreadOp.getInductionVars(), blockMapping)) { + llvm::zip(forallOp.getInductionVars(), blockMapping)) { bvm.map(blockIdx, blockOps[static_cast( blockDim.cast().getMappingId())]); } - // Step 4. Move the body of foreachThreadOp. + // Step 4. Move the body of forallOp. 
// Erase the terminator first, it will not be used since we are on buffers. - rewriter.eraseOp(foreachThreadOp.getTerminator()); - Block *targetBlock = foreachThreadOp->getBlock(); - Block::iterator insertionPoint = Block::iterator(foreachThreadOp); - Block &sourceBlock = foreachThreadOp.getRegion().front(); + rewriter.eraseOp(forallOp.getTerminator()); + Block *targetBlock = forallOp->getBlock(); + Block::iterator insertionPoint = Block::iterator(forallOp); + Block &sourceBlock = forallOp.getRegion().front(); targetBlock->getOperations().splice(insertionPoint, sourceBlock.getOperations()); // Step 5. RAUW thread indices to thread ops. - for (Value loopIndex : foreachThreadOp.getInductionVars()) { + for (Value loopIndex : forallOp.getInductionVars()) { Value blockIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, blockIdx); } // Step 6. Erase old op. - rewriter.eraseOp(foreachThreadOp); + rewriter.eraseOp(forallOp); return DiagnosedSilenceableFailure::success(); } -DiagnosedSilenceableFailure mlir::transform::gpu::findTopLevelForeachThreadOp( - Operation *target, scf::ForeachThreadOp &topLevelForeachThreadOp, - TransformOpInterface transformOp) { - auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) { - if (foreachThreadOp->getParentOfType()) +DiagnosedSilenceableFailure +mlir::transform::gpu::findTopLevelForallOp(Operation *target, + scf::ForallOp &topLevelForallOp, + TransformOpInterface transformOp) { + auto walkResult = target->walk([&](scf::ForallOp forallOp) { + if (forallOp->getParentOfType()) return WalkResult::advance(); - if (topLevelForeachThreadOp) + if (topLevelForallOp) // TODO: Handle multiple foreach if there is no dependences between them return WalkResult::interrupt(); - topLevelForeachThreadOp = foreachThreadOp; + topLevelForallOp = forallOp; return WalkResult::advance(); }); if (walkResult.wasInterrupted()) return transformOp.emitSilenceableError() - << "could not find a unique topLevel scf.foreach_thread"; + << "could not find a unique topLevel scf.forall"; return DiagnosedSilenceableFailure::success(); } /// This is a helper that is only used in -/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects +/// rewriteTopLevelForallToGpuBlocks. It generates GPU dialects /// block_id. 
-static void generateGpuBlockIds(RewriterBase &rewriter, - scf::ForeachThreadOp foreachOp, +static void generateGpuBlockIds(RewriterBase &rewriter, scf::ForallOp foreachOp, SmallVectorImpl &blockOps) { Location loc = foreachOp->getLoc(); OpBuilder::InsertionGuard guard(rewriter); @@ -308,19 +307,18 @@ return diag; } - scf::ForeachThreadOp topLevelForeachThreadOp; - DiagnosedSilenceableFailure diag = - mlir::transform::gpu::findTopLevelForeachThreadOp( - target, topLevelForeachThreadOp, transformOp); + scf::ForallOp topLevelForallOp; + DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp( + target, topLevelForallOp, transformOp); if (!diag.succeeded()) { diag.attachNote(target->getLoc()) << "when applied to this payload op"; return diag; } OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(topLevelForeachThreadOp); + rewriter.setInsertionPoint(topLevelForallOp); - // Generate gpu launch here and move the foreach_thread inside + // Generate gpu launch here and move the forall inside if (getGenerateGpuLaunch()) { DiagnosedSilenceableFailure diag = createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); @@ -328,9 +326,9 @@ return diag; } rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); - Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp); - rewriter.eraseOp(topLevelForeachThreadOp); - topLevelForeachThreadOp = cast(newForeachThreadOp); + Operation *newForallOp = rewriter.clone(*topLevelForallOp); + rewriter.eraseOp(topLevelForallOp); + topLevelForallOp = cast(newForallOp); } SmallVector gridDim = extractFromI64ArrayAttr(getGridDim()); @@ -340,11 +338,11 @@ GPUBlockMappingAttr::get(getContext(), Blocks::DimZ)}; diag = checkAttributeType(blockMappingAttributes, - topLevelForeachThreadOp.getMapping(), transformOp); + topLevelForallOp.getMapping(), transformOp); if (diag.succeeded()) diag = mlir::transform::gpu::mapForeachToBlocksImpl( - rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim, - transformOp, blockMappingAttributes); + rewriter, topLevelForallOp, generateGpuBlockIds, gridDim, transformOp, + blockMappingAttributes); if (diag.succeeded()) { diag = alterGpuLaunch(rewriter, gpuLaunch, cast(getOperation()), @@ -359,51 +357,50 @@ // MapNestedForeachToThreads //===----------------------------------------------------------------------===// -/// Searches `scf.foreach_thread` ops nested under `target` and maps each such +/// Searches `scf.forall` ops nested under `target` and maps each such /// op to GPU threads. Mapping is one-to-one and the induction variables of -/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the -/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in +/// `scf.forall` are rewritten to gpu.thread_id according to the +/// thread_dim_mapping attribute. Sibling `scf.forall` are supported in /// which case, the union of the number of threads is computed and may result -/// in predication. Dynamic, `scf.foreach_thread` trip counts are currently +/// in predication. Dynamic, `scf.forall` trip counts are currently /// not supported. Dynamic block dim sizes are currently not supported. 
-static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads( - RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp, +static DiagnosedSilenceableFailure rewriteOneForallToGpuThreads( + RewriterBase &rewriter, scf::ForallOp forallOp, const SmallVectorImpl &globalBlockDims, const SmallVectorImpl &threadOps, bool syncAfterDistribute, std::optional transformOp, const ArrayRef &threadMappingAttributes) { // Step 0. Target-specific verifications. There is no good place to anchor - // those right now: the ForeachThreadOp is target-independent and the - // transform op does not apply to individual ForeachThreadOp. + // those right now: the ForallOp is target-independent and the + // transform op does not apply to individual ForallOp. auto failureHelper = [&](const Twine &message) -> DiagnosedSilenceableFailure { if (transformOp.has_value()) { return transformOp->emitSilenceableError() << message; } - return emitDefiniteFailure(foreachThreadOp, message); + return emitDefiniteFailure(forallOp, message); }; - Location loc = foreachThreadOp->getLoc(); - if (!foreachThreadOp.isNormalized()) + Location loc = forallOp->getLoc(); + if (!forallOp.isNormalized()) return failureHelper("unsupported non-normalized loops"); - if (foreachThreadOp.getNumResults() > 0) - return failureHelper( - "only bufferized scf.foreach_thread lowers to gpu.thread_id"); - if (foreachThreadOp.getRank() > 3) + if (forallOp.getNumResults() > 0) + return failureHelper("only bufferized scf.forall lowers to gpu.thread_id"); + if (forallOp.getRank() > 3) return failureHelper( - "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id"); - if (llvm::any_of(foreachThreadOp.getMixedUpperBound(), [](OpFoldResult ofr) { + "scf.forall with rank > 3 does not lower to gpu.thread_id"); + if (llvm::any_of(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) { return !getConstantIntValue(ofr).has_value(); })) { return failureHelper("unsupported dynamic blockdim size"); } - if (!foreachThreadOp.getMapping().has_value()) + if (!forallOp.getMapping().has_value()) return failureHelper("mapping must be present"); SmallVector threadMapping = - llvm::to_vector(foreachThreadOp.getMapping()->getValue()); + llvm::to_vector(forallOp.getMapping()->getValue()); // Step 1. Complete the threadMapping to a full mapping (with 1s) if // necessary. - SmallVector numThreads = foreachThreadOp.getUpperBound(rewriter); + SmallVector numThreads = forallOp.getUpperBound(rewriter); // Ensure we have 3 block sizes, one for each id. Value one; for (auto attr : threadMappingAttributes) { @@ -420,9 +417,8 @@ DeviceMappingAttrInterface b) -> bool { return a.getMappingId() < b.getMappingId(); }; - SmallVector blockDimValues = - scf::ForeachThreadOp::getValuesSortedByKey(threadMapping, numThreads, - comparator); + SmallVector blockDimValues = scf::ForallOp::getValuesSortedByKey( + threadMapping, numThreads, comparator); SmallVector blockDims = llvm::to_vector(llvm::map_range(blockDimValues, [](Value v) { return v.getDefiningOp().value(); @@ -440,7 +436,7 @@ } IRMapping bvm; for (auto [blockIdx, blockDim] : - llvm::zip(foreachThreadOp.getInductionVars(), threadMapping)) { + llvm::zip(forallOp.getInductionVars(), threadMapping)) { bvm.map(blockIdx, threadOpsUpdated[blockDim.cast() .getMappingId()]); @@ -453,7 +449,7 @@ if (blockDim > globalBlockDim) { return failureHelper( "The requested GPU threads are fewer than the number of loop trip " - "counts. Try to tile scf.foreach_thread before mapping or set " + "counts. 
Try to tile scf.forall before mapping or set " "small blockDim."); } if (blockDim == globalBlockDim) @@ -466,9 +462,9 @@ : tmpPredicate; } - // Step 5. Move the body of foreachThreadOp. + // Step 5. Move the body of forallOp. // Erase the terminator first, it will not be used. - rewriter.eraseOp(foreachThreadOp.getTerminator()); + rewriter.eraseOp(forallOp.getTerminator()); Block *targetBlock; Block::iterator insertionPoint; if (predicate) { @@ -478,16 +474,16 @@ targetBlock = ifOp.thenBlock(); insertionPoint = ifOp.thenBlock()->begin(); } else { - // Step 5.b. Otherwise, move inline just before foreachThreadOp. - targetBlock = foreachThreadOp->getBlock(); - insertionPoint = Block::iterator(foreachThreadOp); + // Step 5.b. Otherwise, move inline just before forallOp. + targetBlock = forallOp->getBlock(); + insertionPoint = Block::iterator(forallOp); } - Block &sourceBlock = foreachThreadOp.getRegion().front(); + Block &sourceBlock = forallOp.getRegion().front(); targetBlock->getOperations().splice(insertionPoint, sourceBlock.getOperations()); // Step 6. RAUW thread indices to thread ops. - for (Value loopIndex : foreachThreadOp.getInductionVars()) { + for (Value loopIndex : forallOp.getInductionVars()) { Value threadIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, threadIdx); } @@ -498,7 +494,7 @@ rewriter.create(loc); // Step 8. Erase old op. - rewriter.eraseOp(foreachThreadOp); + rewriter.eraseOp(forallOp); return DiagnosedSilenceableFailure::success(); } @@ -506,28 +502,27 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImpl( RewriterBase &rewriter, Operation *target, const SmallVectorImpl &blockDim, - function_ref &)> + function_ref &)> threadIdGenerator, bool syncAfterDistribute, std::optional transformOp, const ArrayRef &threadMappingAttributes) { DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success(); - target->walk([&](scf::ForeachThreadOp foreachThreadOp) { + target->walk([&](scf::ForallOp forallOp) { // Ignore cases with different attributes. - for (Attribute map : foreachThreadOp.getMapping()->getValue()) { + for (Attribute map : forallOp.getMapping()->getValue()) { if (!llvm::is_contained(threadMappingAttributes, map)) { return WalkResult::skip(); } } - diag = checkAttributeType(threadMappingAttributes, - foreachThreadOp.getMapping(), transformOp); + diag = checkAttributeType(threadMappingAttributes, forallOp.getMapping(), + transformOp); if (diag.succeeded()) { - rewriter.setInsertionPoint(foreachThreadOp); + rewriter.setInsertionPoint(forallOp); SmallVector threadOps; - threadIdGenerator(rewriter, foreachThreadOp, threadOps); - diag = rewriteOneForeachThreadToGpuThreads( - rewriter, foreachThreadOp, blockDim, threadOps, syncAfterDistribute, - transformOp, threadMappingAttributes); + threadIdGenerator(rewriter, forallOp, threadOps); + diag = rewriteOneForallToGpuThreads(rewriter, forallOp, blockDim, + threadOps, syncAfterDistribute, + transformOp, threadMappingAttributes); } return diag.succeeded() ? 
WalkResult::advance() : WalkResult::interrupt(); }); @@ -562,16 +557,15 @@ GPUThreadMappingAttr::get(ctx, Threads::DimX), GPUThreadMappingAttr::get(ctx, Threads::DimY), GPUThreadMappingAttr::get(ctx, Threads::DimZ)}; - auto threadIdGenerator = [](RewriterBase &rewriter, - scf::ForeachThreadOp foreachThreadOp, + auto threadIdGenerator = [](RewriterBase &rewriter, scf::ForallOp forallOp, SmallVectorImpl &threadIds) { IndexType indexType = rewriter.getIndexType(); - threadIds.assign({rewriter.create(foreachThreadOp->getLoc(), - indexType, Dimension::x), - rewriter.create(foreachThreadOp->getLoc(), - indexType, Dimension::y), - rewriter.create(foreachThreadOp->getLoc(), - indexType, Dimension::z)}); + threadIds.assign({rewriter.create(forallOp->getLoc(), indexType, + Dimension::x), + rewriter.create(forallOp->getLoc(), indexType, + Dimension::y), + rewriter.create(forallOp->getLoc(), indexType, + Dimension::z)}); }; diag = mlir::transform::gpu::mapNestedForeachToThreadsImpl( rewriter, target, blockDim, threadIdGenerator, getSyncAfterDistribute(), diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -482,7 +482,7 @@ return fusedOp; } -/// First, find the first "scf::ForeachThreadOp" user of `producerOp` and ensure +/// First, find the first "scf::ForallOp" user of `producerOp` and ensure /// it is exactly the `containingOp`, otherwise bail. /// Then, find the first "extract" user of the tied block argument and tile it /// right before its "extract" use. The tiled op is fused under the @@ -500,15 +500,15 @@ return nullptr; } - // Search the first use by a "scf::ForeachThreadOp" user. - scf::ForeachThreadOp foreachThreadOp; + // Search the first use by a "scf::ForallOp" user. + scf::ForallOp forallOp; auto itProducerUses = llvm::find_if(tileableProducer->getUses(), [&](OpOperand &use) { - foreachThreadOp = dyn_cast(use.getOwner()); - return foreachThreadOp; + forallOp = dyn_cast(use.getOwner()); + return forallOp; }); // If it's not from the containing op, return. - if (!foreachThreadOp || foreachThreadOp != containingOp) { + if (!forallOp || forallOp != containingOp) { diag.attachNote(tileableProducer->getLoc()) << "could not find a use by the containing op: " << *tileableProducer; return nullptr; @@ -519,7 +519,7 @@ // TODO: Generalize to more extract/insert/parallel_insert triples. // Maybe evolve into an interface. OpOperand *pUse = &(*itProducerUses); - BlockArgument bbArg = foreachThreadOp.getTiedBlockArgument(pUse); + BlockArgument bbArg = forallOp.getTiedBlockArgument(pUse); // Search the producer slices accessed within the containing operation. // TODO: Generalize to more extract/insert/parallel_insert triples, maybe @@ -2188,7 +2188,7 @@ // This is future-proof re mixed static-dynamic and setting up the proper // operands segment sizes attributes for multiple variadic operands. // In the absence of this, horrible bugs ensue. - // TODO: support mixed static-dynamic (see TileToForeachThreadOp). + // TODO: support mixed static-dynamic (see TileToForallOp). 
MLIRContext *ctx = builder.getContext(); auto opTy = pdl::OperationType::get(ctx); auto staticTileSizesAttr = builder.getDenseI64ArrayAttr(staticTileSizes); @@ -2217,10 +2217,10 @@ } //===----------------------------------------------------------------------===// -// TileReductionUsingForeachThreadOp +// TileReductionUsingForallOp //===----------------------------------------------------------------------===// -void transform::TileReductionUsingForeachThreadOp::build( +void transform::TileReductionUsingForallOp::build( OpBuilder &builder, OperationState &result, Value target, ArrayRef staticNumThreads, ArrayRef staticTileSizes, ArrayAttr mapping) { @@ -2228,7 +2228,7 @@ // This is future-proof re mixed static-dynamic and setting up the proper // operands segment sizes attributes for multiple variadic operands. // In the absence of this, horrible bugs ensue. - // TODO: support mixed static-dynamic (see TileToForeachThreadOp). + // TODO: support mixed static-dynamic (see TileToForallOp). MLIRContext *ctx = builder.getContext(); auto opTy = pdl::OperationType::get(ctx); auto staticNumThreadsAttr = builder.getDenseI64ArrayAttr(staticNumThreads); @@ -2241,8 +2241,7 @@ /*mapping=*/mapping); } -DiagnosedSilenceableFailure -transform::TileReductionUsingForeachThreadOp::applyToOne( +DiagnosedSilenceableFailure transform::TileReductionUsingForallOp::applyToOne( LinalgOp target, transform::ApplyToEachResultList &results, transform::TransformState &state) { TrivialPatternRewriter rewriter(getContext()); @@ -2251,8 +2250,8 @@ getAsOpFoldResult(rewriter.getI64ArrayAttr(getNumThreads())); SmallVector tileSizes = getAsOpFoldResult(rewriter.getI64ArrayAttr(getTileSizes())); - FailureOr result = - linalg::tileReductionUsingForeachThread( + FailureOr result = + linalg::tileReductionUsingForall( rewriter, cast(target.getOperation()), numThreads, tileSizes, getMapping()); @@ -2553,15 +2552,14 @@ } //===----------------------------------------------------------------------===// -// TileToForeachThreadOp +// TileToForallOp //===----------------------------------------------------------------------===// -void transform::TileToForeachThreadOp::build(OpBuilder &builder, - OperationState &result, - Value target, - ArrayRef staticTileSizes, - transform::TileSizesSpec, - ArrayAttr mapping) { +void transform::TileToForallOp::build(OpBuilder &builder, + OperationState &result, Value target, + ArrayRef staticTileSizes, + transform::TileSizesSpec, + ArrayAttr mapping) { return build(builder, result, /*target=*/target, /*mixedTileSizes=*/ @@ -2570,10 +2568,11 @@ /*mapping=*/mapping); } -void transform::TileToForeachThreadOp::build( - OpBuilder &builder, OperationState &result, Value target, - ArrayRef mixedTileSizes, transform::TileSizesSpec, - ArrayAttr mapping) { +void transform::TileToForallOp::build(OpBuilder &builder, + OperationState &result, Value target, + ArrayRef mixedTileSizes, + transform::TileSizesSpec, + ArrayAttr mapping) { SmallVector staticTileSizes; SmallVector dynamicTileSizes; dispatchIndexOpFoldResults(mixedTileSizes, dynamicTileSizes, staticTileSizes); @@ -2595,21 +2594,21 @@ /*mapping=*/mapping); } -void transform::TileToForeachThreadOp::build(OpBuilder &builder, - OperationState &result, - Value target, - ArrayRef staticNumThreads, - transform::NumThreadsSpec, - ArrayAttr mapping) { +void transform::TileToForallOp::build(OpBuilder &builder, + OperationState &result, Value target, + ArrayRef staticNumThreads, + transform::NumThreadsSpec, + ArrayAttr mapping) { return build(builder, result, target, 
getAsOpFoldResult(builder.getI64ArrayAttr(staticNumThreads)), NumThreadsSpec(), mapping); } -void transform::TileToForeachThreadOp::build( - OpBuilder &builder, OperationState &result, Value target, - ArrayRef mixedNumThreads, transform::NumThreadsSpec, - ArrayAttr mapping) { +void transform::TileToForallOp::build(OpBuilder &builder, + OperationState &result, Value target, + ArrayRef mixedNumThreads, + transform::NumThreadsSpec, + ArrayAttr mapping) { SmallVector staticNumThreads; SmallVector dynamicNumThreads; dispatchIndexOpFoldResults(mixedNumThreads, dynamicNumThreads, @@ -2632,7 +2631,7 @@ /*mapping=*/mapping); } -DiagnosedSilenceableFailure transform::tileToForeachThreadOpImpl( +DiagnosedSilenceableFailure transform::tileToForallOpImpl( RewriterBase &rewriter, transform::TransformState &state, TransformOpInterface transformOp, ArrayRef targets, ArrayRef mixedNumThreads, @@ -2652,12 +2651,12 @@ return diag; } rewriter.setInsertionPoint(tileableOp); - FailureOr tilingResult = failure(); + FailureOr tilingResult = failure(); if (!mixedNumThreads.empty()) { - tilingResult = linalg::tileToForeachThreadOp(rewriter, tileableOp, - mixedNumThreads, mapping); + tilingResult = linalg::tileToForallOp(rewriter, tileableOp, + mixedNumThreads, mapping); } else { - tilingResult = linalg::tileToForeachThreadOpUsingTileSizes( + tilingResult = linalg::tileToForallOpUsingTileSizes( rewriter, tileableOp, mixedTileSizes, mapping); } @@ -2671,9 +2670,9 @@ return DiagnosedSilenceableFailure::success(); } -DiagnosedSilenceableFailure transform::TileToForeachThreadOp::apply( - transform::TransformResults &transformResults, - transform::TransformState &state) { +DiagnosedSilenceableFailure +transform::TileToForallOp::apply(transform::TransformResults &transformResults, + transform::TransformState &state) { IRRewriter rewriter(getContext()); auto transformOp = cast(getOperation()); ArrayRef targets = state.getPayloadOps(getTarget()); @@ -2701,20 +2700,20 @@ if (!status.succeeded()) return status; - DiagnosedSilenceableFailure diag = tileToForeachThreadOpImpl( - rewriter, state, transformOp, targets, mixedNumThreads, mixedTileSizes, - getMapping(), tileOps, tiledOps); + DiagnosedSilenceableFailure diag = + tileToForallOpImpl(rewriter, state, transformOp, targets, mixedNumThreads, + mixedTileSizes, getMapping(), tileOps, tiledOps); if (!diag.succeeded()) return diag; - transformResults.set(getForeachThreadOp().cast(), tileOps); + transformResults.set(getForallOp().cast(), tileOps); transformResults.set(getTiledOp().cast(), tiledOps); return DiagnosedSilenceableFailure::success(); } -void transform::TileToForeachThreadOp::getEffects( +void transform::TileToForallOp::getEffects( SmallVectorImpl &effects) { consumesHandle(getTarget(), effects); onlyReadsHandle(getTileSizes(), effects); @@ -2725,17 +2724,17 @@ modifiesPayload(effects); } -SmallVector TileToForeachThreadOp::getMixedNumThreads() { +SmallVector TileToForallOp::getMixedNumThreads() { Builder b(getContext()); return getMixedValues(getStaticNumThreads(), getNumThreads(), b); } -SmallVector TileToForeachThreadOp::getMixedTileSizes() { +SmallVector TileToForallOp::getMixedTileSizes() { Builder b(getContext()); return getMixedValues(getStaticTileSizes(), getTileSizes(), b); } -LogicalResult TileToForeachThreadOp::verify() { +LogicalResult TileToForallOp::verify() { int numThreadsSpec = static_cast(!getMixedNumThreads().empty()) + static_cast(getPackedNumThreads() != Value()); if (numThreadsSpec > 1) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp 
b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -244,16 +244,16 @@ /// Fill out the `tiledOffsets` and `tiledSizes` to be used to tile to a given /// number of threads. static void calculateTileOffsetsAndSizes( - RewriterBase &b, Location loc, scf::ForeachThreadOp foreachThreadOp, + RewriterBase &b, Location loc, scf::ForallOp forallOp, ArrayRef numThreads, SmallVector loopRanges, bool omitTileOffsetBoundsCheck, std::optional> nominalTileSizes, SmallVector &tiledOffsets, SmallVector &tiledSizes) { OpBuilder::InsertionGuard g(b); - b.setInsertionPointToStart(foreachThreadOp.getBody(0)); + b.setInsertionPointToStart(forallOp.getBody(0)); - ValueRange threadIds = foreachThreadOp.getInductionVars(); + ValueRange threadIds = forallOp.getInductionVars(); SmallVector nonZeroNumThreads = llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) { return !isConstantIntValue(ofr, 0); @@ -314,19 +314,19 @@ } } -/// Rewrite a TilingInterface `op` to a tiled `scf.foreach_thread`. The +/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The /// tiling is specified by the number of tiles/threads `numThreads` and the /// optional nominal tile size `nominalTileSizes`. If `nominalTilSizes` is /// not specified, then it is derived from `numThreads` as `ceilDiv(dimSize[i], /// numThreads[i])`. If non-empty, the `mapping` is added as an -/// attribute to the resulting `scf.foreach_thread`. A zero tile sizes indicate +/// attribute to the resulting `scf.forall`. A zero tile sizes indicate /// that the dimension is not tiled, and can be thought of as tiling by the full /// size of data. /// It is the user's responsibility to ensure that `numThreads` is a valid /// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the /// Linalg case). If `omitTileOffsetBoundsCheck` is true, then the function will /// assume that `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds. -static FailureOr tileToForeachThreadOpImpl( +static FailureOr tileToForallOpImpl( RewriterBase &b, TilingInterface op, ArrayRef numThreads, std::optional> nominalTileSizes, std::optional mapping, bool omitTileOffsetBoundsCheck) { @@ -356,26 +356,25 @@ Operation *tiledOp = nullptr; - // 1. Create the ForeachThreadOp. We don't use the lambda body-builder + // 1. Create the ForallOp. We don't use the lambda body-builder // version because we require the use of RewriterBase in the body, so we // manually move the insertion point to the body below. - scf::ForeachThreadOp foreachThreadOp = b.create( + scf::ForallOp forallOp = b.create( loc, getAsOpFoldResult((materializedNonZeroNumThreads)), dest, mapping); - // 2. Fill out the ForeachThreadOp body. + // 2. Fill out the ForallOp body. SmallVector tiledOffsets, tiledSizes; - calculateTileOffsetsAndSizes(b, loc, foreachThreadOp, numThreads, loopRanges, + calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, loopRanges, omitTileOffsetBoundsCheck, nominalTileSizes, tiledOffsets, tiledSizes); // 3. Clone the tileable op and update its destination operands to use the - // output bbArgs of the ForeachThreadOp. - ArrayRef destBbArgs = - foreachThreadOp.getOutputBlockArguments(); + // output bbArgs of the ForallOp. + ArrayRef destBbArgs = forallOp.getOutputBlockArguments(); { - // 3.a. RAII guard, inserting within foreachThreadOp, before terminator. + // 3.a. RAII guard, inserting within forallOp, before terminator. 
OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(foreachThreadOp.getTerminator()); + b.setInsertionPoint(forallOp.getTerminator()); Operation *clonedOp = b.clone(*op.getOperation()); auto destinationStyleOp = dyn_cast(clonedOp); if (destinationStyleOp) { @@ -404,7 +403,7 @@ tilingInterfaceOp->getResults(), destBbArgs)) { // 5.a. Partial subset information is inserted just before the terminator. OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(foreachThreadOp.getTerminator()); + b.setInsertionPoint(forallOp.getTerminator()); SmallVector resultOffsets, resultSizes; if (failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets, @@ -415,27 +414,27 @@ // 5.b. Parallel insertions are inserted at the end of the combining // terminator. - b.setInsertionPointToEnd(foreachThreadOp.getTerminator().getBody()); + b.setInsertionPointToEnd(forallOp.getTerminator().getBody()); b.create(loc, std::get<1>(it), std::get<2>(it), resultOffsets, resultSizes, strides); } - return ForeachThreadTilingResult{foreachThreadOp, tiledOp}; + return ForallTilingResult{forallOp, tiledOp}; } -FailureOr -linalg::tileToForeachThreadOp(RewriterBase &b, TilingInterface op, - ArrayRef numThreads, - std::optional mapping) { - return tileToForeachThreadOpImpl(b, op, numThreads, - /*nominalTileSizes=*/std::nullopt, mapping, - /*omitTileOffsetBoundsCheck=*/false); +FailureOr +linalg::tileToForallOp(RewriterBase &b, TilingInterface op, + ArrayRef numThreads, + std::optional mapping) { + return tileToForallOpImpl(b, op, numThreads, + /*nominalTileSizes=*/std::nullopt, mapping, + /*omitTileOffsetBoundsCheck=*/false); } -FailureOr -linalg::tileToForeachThreadOpUsingTileSizes(RewriterBase &b, TilingInterface op, - ArrayRef tileSizes, - std::optional mapping) { +FailureOr +linalg::tileToForallOpUsingTileSizes(RewriterBase &b, TilingInterface op, + ArrayRef tileSizes, + std::optional mapping) { SmallVector loopRanges = op.getIterationDomain(b); unsigned nLoops = loopRanges.size(); SmallVector numThreads; @@ -450,9 +449,9 @@ b, op.getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)}); numThreads.push_back(numTiles); } - return tileToForeachThreadOpImpl(b, op, numThreads, - /*nominalTileSizes=*/tileSizes, mapping, - /*omitTileOffsetBoundsCheck=*/true); + return tileToForallOpImpl(b, op, numThreads, + /*nominalTileSizes=*/tileSizes, mapping, + /*omitTileOffsetBoundsCheck=*/true); } template @@ -608,12 +607,10 @@ res, loops, outermostLoop ? outermostLoop->getResults() : tensorResults}; } -FailureOr -linalg::tileReductionUsingForeachThread(RewriterBase &b, - PartialReductionOpInterface op, - ArrayRef numThreads, - ArrayRef tileSizes, - std::optional mapping) { +FailureOr linalg::tileReductionUsingForall( + RewriterBase &b, PartialReductionOpInterface op, + ArrayRef numThreads, ArrayRef tileSizes, + std::optional mapping) { Location loc = op.getLoc(); OpBuilder::InsertionGuard g(b); @@ -679,28 +676,27 @@ SmallVector materializedNonZeroNumThreads = getAsValues(b, loc, nonZeroNumThreads); - // 2. Create the ForeachThreadOp with an empty region. - scf::ForeachThreadOp foreachThreadOp = b.create( + // 2. Create the ForallOp with an empty region. + scf::ForallOp forallOp = b.create( loc, getAsOpFoldResult(materializedNonZeroNumThreads), (*identityTensor)->getResults(), mapping); // 3. Calculate the tile offsets and sizes for the subsequent loop that will - // be nested under `foreachThreadOp`. + // be nested under `forallOp`. 
SmallVector tiledOffsets, tiledSizes; - calculateTileOffsetsAndSizes( - b, loc, foreachThreadOp, numThreads, iterationDomain, - /*omitTileOffsetBoundsCheck =*/false, - /*nominalTileSizes=*/std::nullopt, tiledOffsets, tiledSizes); + calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, iterationDomain, + /*omitTileOffsetBoundsCheck =*/false, + /*nominalTileSizes=*/std::nullopt, tiledOffsets, + tiledSizes); // 4. Clone the tileable op and update its destination operands to use the - // output bbArgs of the ForeachThreadOp. + // output bbArgs of the ForallOp. ValueRange tilingResults; - ArrayRef destBbArgs = - foreachThreadOp.getOutputBlockArguments(); + ArrayRef destBbArgs = forallOp.getOutputBlockArguments(); { - // 4.a. RAII guard, inserting within foreachThreadOp, before terminator. + // 4.a. RAII guard, inserting within forallOp, before terminator. OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(foreachThreadOp.getTerminator()); + b.setInsertionPoint(forallOp.getTerminator()); SmallVector tiledDpsInitOperands; for (OpOperand *initOperand : destinationStyleOp.getDpsInitOperands()) { @@ -712,7 +708,7 @@ b.getIndexAttr(0)); SmallVector sizes = tiledSizes; sizes[reductionDim] = b.getIndexAttr(1); - outOffsets[reductionDim] = foreachThreadOp.getInductionVars().front(); + outOffsets[reductionDim] = forallOp.getInductionVars().front(); // TODO: use SubsetExtractOpInterface once it is available. tiledDpsInitOperands.push_back(b.create( loc, initOperand->get().getType().cast(), @@ -746,7 +742,7 @@ if (failed(maybeTiled)) return b.notifyMatchFailure(op, "failed tileLinalgOpImpl"); - SmallVector ids = foreachThreadOp.getInductionVars(); + SmallVector ids = forallOp.getInductionVars(); mapLoopToProcessorIds(cast(maybeTiled->loops.back()), ids, materializedNonZeroNumThreads); assert(maybeTiled->loops.size() == 1 && @@ -763,7 +759,7 @@ llvm::seq(0, dest.size()), tilingResults, destBbArgs)) { // 6.a. Partial subset information is inserted just before the terminator. OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(foreachThreadOp.getTerminator()); + b.setInsertionPoint(forallOp.getTerminator()); SmallVector resultOffsets, resultSizes; if (failed(tilingInterfaceOp.getResultTilePosition( @@ -774,7 +770,7 @@ int64_t sizeIdx = 0; for (int64_t i = 0, e = numThreads.size(); i < e; ++i) { if (i == reductionDim) { - resultOffsetsRank.push_back(foreachThreadOp.getInductionVars().front()); + resultOffsetsRank.push_back(forallOp.getInductionVars().front()); resultSizesRank.push_back(b.getIndexAttr(1)); continue; } @@ -786,21 +782,21 @@ // 6.b. Parallel insertions are inserted at the end of the combining // terminator. - b.setInsertionPointToEnd(foreachThreadOp.getTerminator().getBody()); + b.setInsertionPointToEnd(forallOp.getTerminator().getBody()); b.create( loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides); } // 7. Merge the partial reductions. - b.setInsertionPointAfter(foreachThreadOp); + b.setInsertionPointAfter(forallOp); Operation *mergeOp = - op.mergeReductions(b, loc, foreachThreadOp->getResults(), reductionDim); + op.mergeReductions(b, loc, forallOp->getResults(), reductionDim); b.replaceOp(op, mergeOp->getResults()); // 8. Return. 
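For reference, the renamed Linalg entry points above keep their original calling convention; the following is a minimal caller-side sketch, not part of this patch, in which `rewriter` (a RewriterBase), `op` (a TilingInterface op) and the 10x20 thread grid are assumed to be in scope:

  // Tile `op` to an scf.forall over a 10x20 grid of threads (no GPU mapping).
  SmallVector<OpFoldResult> numThreads = {rewriter.getIndexAttr(10),
                                          rewriter.getIndexAttr(20)};
  FailureOr<linalg::ForallTilingResult> tiled =
      linalg::tileToForallOp(rewriter, op, numThreads, /*mapping=*/std::nullopt);
  if (failed(tiled))
    return failure();

With `tileToForallOpUsingTileSizes` the thread count is derived instead, e.g. a dimension of size 300 tiled by 15 yields ceilDiv(300, 15) = 20 threads; conversely, the `matmul_static` test below distributes a size-300 dimension across 21 threads, giving tiles of ceilDiv(300, 21) = 15. The step-8 code that follows packs the reduction variant's results into the renamed `ForallReductionTilingResult`.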
- ForeachThreadReductionTilingResult results; + ForallReductionTilingResult results; results.initialOp = *identityTensor; - results.loops = foreachThreadOp; + results.loops = forallOp; results.parallelTiledOp = tiledOp; results.mergeOp = mergeOp; return results; diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -1106,10 +1106,10 @@ } //===----------------------------------------------------------------------===// -// ForeachThreadOp +// ForallOp //===----------------------------------------------------------------------===// -LogicalResult ForeachThreadOp::verify() { +LogicalResult ForallOp::verify() { unsigned numLoops = getRank(); // Check number of outputs. if (getNumResults() != getOutputs().size()) @@ -1156,7 +1156,7 @@ return success(); } -void ForeachThreadOp::print(OpAsmPrinter &p) { +void ForallOp::print(OpAsmPrinter &p) { Operation *op = getOperation(); p << " (" << getInductionVars(); if (isNormalized()) { @@ -1187,8 +1187,7 @@ getStaticStepAttrName()}); } -ParseResult ForeachThreadOp::parse(OpAsmParser &parser, - OperationState &result) { +ParseResult ForallOp::parse(OpAsmParser &parser, OperationState &result) { OpBuilder b(parser.getContext()); auto indexType = b.getIndexType(); @@ -1267,7 +1266,7 @@ return failure(); // Ensure terminator and move region. - ForeachThreadOp::ensureTerminator(*region, b, result.location); + ForallOp::ensureTerminator(*region, b, result.location); result.addRegion(std::move(region)); // Parse the optional attribute list. @@ -1287,7 +1286,7 @@ } // Builder that takes loop bounds. -void ForeachThreadOp::build( +void ForallOp::build( mlir::OpBuilder &b, mlir::OperationState &result, ArrayRef lbs, ArrayRef ubs, ArrayRef steps, ValueRange outputs, @@ -1318,7 +1317,7 @@ static_cast(dynamicSteps.size()), static_cast(outputs.size())})); if (mapping.has_value()) { - result.addAttribute(ForeachThreadOp::getMappingAttrName(result.name), + result.addAttribute(ForallOp::getMappingAttrName(result.name), mapping.value()); } @@ -1337,20 +1336,19 @@ b.setInsertionPointToStart(&bodyBlock); if (!bodyBuilderFn) { - ForeachThreadOp::ensureTerminator(*bodyRegion, b, result.location); + ForallOp::ensureTerminator(*bodyRegion, b, result.location); return; } bodyBuilderFn(b, result.location, bodyBlock.getArguments()); #ifndef NDEBUG - auto terminator = - llvm::dyn_cast(bodyBlock.getTerminator()); + auto terminator = llvm::dyn_cast(bodyBlock.getTerminator()); assert(terminator && - "expected bodyBuilderFn to create PerformConcurrentlyOp terminator"); + "expected bodyBuilderFn to create InParallelOp terminator"); #endif // NDEBUG } // Builder that takes loop bounds. -void ForeachThreadOp::build( +void ForallOp::build( mlir::OpBuilder &b, mlir::OperationState &result, ArrayRef ubs, ValueRange outputs, std::optional mapping, @@ -1362,7 +1360,7 @@ } // Checks if the lbs are zeros and steps are ones. -bool ForeachThreadOp::isNormalized() { +bool ForallOp::isNormalized() { auto allEqual = [](ArrayRef results, int64_t val) { return llvm::all_of(results, [&](OpFoldResult ofr) { auto intValue = getConstantIntValue(ofr); @@ -1375,22 +1373,22 @@ // The ensureTerminator method generated by SingleBlockImplicitTerminator is // unaware of the fact that our terminator also needs a region to be // well-formed. We override it here to ensure that we do the right thing. 
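Concretely, the invariant being maintained is that every `scf.forall` body ends in an `scf.forall.in_parallel` op whose own single-block region holds the `tensor.parallel_insert_slice` ops, as in the following sketch written in the style of the updated tests (shapes and names are illustrative, not taken from this patch):

  func.func @fill_rows(%t: tensor<8x16xf32>, %cst: f32) -> tensor<8x16xf32> {
    %c8 = arith.constant 8 : index
    %r = scf.forall (%iv) in (%c8) shared_outs(%o = %t) -> (tensor<8x16xf32>) {
      %slice = tensor.extract_slice %o[%iv, 0] [1, 16] [1, 1]
          : tensor<8x16xf32> to tensor<1x16xf32>
      %filled = linalg.fill ins(%cst : f32) outs(%slice : tensor<1x16xf32>)
          -> tensor<1x16xf32>
      // The terminator's region is where parallel insertions are collected.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %filled into %o[%iv, 0] [1, 16] [1, 1]
            : tensor<1x16xf32> into tensor<8x16xf32>
      }
    }
    return %r : tensor<8x16xf32>
  }

The override below creates that inner block whenever the generated `ensureTerminator` helper produced a terminator without one.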
-void ForeachThreadOp::ensureTerminator(Region ®ion, OpBuilder &builder, - Location loc) { - OpTrait::SingleBlockImplicitTerminator::Impl< - ForeachThreadOp>::ensureTerminator(region, builder, loc); +void ForallOp::ensureTerminator(Region ®ion, OpBuilder &builder, + Location loc) { + OpTrait::SingleBlockImplicitTerminator::Impl< + ForallOp>::ensureTerminator(region, builder, loc); auto terminator = - llvm::dyn_cast(region.front().getTerminator()); + llvm::dyn_cast(region.front().getTerminator()); if (terminator.getRegion().empty()) builder.createBlock(&terminator.getRegion()); } -PerformConcurrentlyOp ForeachThreadOp::getTerminator() { - return cast(getBody()->getTerminator()); +InParallelOp ForallOp::getTerminator() { + return cast(getBody()->getTerminator()); } /// Helper to sort `values` according to matching `keys`. -SmallVector ForeachThreadOp::getValuesSortedByKey( +SmallVector ForallOp::getValuesSortedByKey( ArrayRef keys, ValueRange values, llvm::function_ref compare) { if (keys.empty()) @@ -1406,28 +1404,27 @@ return res; } -ForeachThreadOp mlir::scf::getForeachThreadOpThreadIndexOwner(Value val) { +ForallOp mlir::scf::getForallOpThreadIndexOwner(Value val) { auto tidxArg = val.dyn_cast(); if (!tidxArg) - return ForeachThreadOp(); + return ForallOp(); assert(tidxArg.getOwner() && "unlinked block argument"); auto *containingOp = tidxArg.getOwner()->getParentOp(); - return dyn_cast(containingOp); + return dyn_cast(containingOp); } namespace { -/// Fold tensor.dim(foreach_thread shared_outs(... = %t)) to tensor.dim(%t). -struct DimOfForeachThreadOp : public OpRewritePattern { +/// Fold tensor.dim(forall shared_outs(... = %t)) to tensor.dim(%t). +struct DimOfForallOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(tensor::DimOp dimOp, PatternRewriter &rewriter) const final { - auto foreachThreadOp = dimOp.getSource().getDefiningOp(); - if (!foreachThreadOp) + auto forallOp = dimOp.getSource().getDefiningOp(); + if (!forallOp) return failure(); Value sharedOut = - foreachThreadOp.getTiedOpOperand(dimOp.getSource().cast()) - ->get(); + forallOp.getTiedOpOperand(dimOp.getSource().cast())->get(); rewriter.updateRootInPlace( dimOp, [&]() { dimOp.getSourceMutable().assign(sharedOut); }); return success(); @@ -1435,29 +1432,29 @@ }; } // namespace -void ForeachThreadOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add(context); +void ForallOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add(context); } //===----------------------------------------------------------------------===// -// PerformConcurrentlyOp +// InParallelOp //===----------------------------------------------------------------------===// -// Build a PerformConcurrentlyOp with mixed static and dynamic entries. -void PerformConcurrentlyOp::build(OpBuilder &b, OperationState &result) { +// Build a InParallelOp with mixed static and dynamic entries. 
+void InParallelOp::build(OpBuilder &b, OperationState &result) { OpBuilder::InsertionGuard g(b); Region *bodyRegion = result.addRegion(); b.createBlock(bodyRegion); } -LogicalResult PerformConcurrentlyOp::verify() { - scf::ForeachThreadOp foreachThreadOp = - dyn_cast(getOperation()->getParentOp()); - if (!foreachThreadOp) - return this->emitOpError("expected foreach_thread op parent"); +LogicalResult InParallelOp::verify() { + scf::ForallOp forallOp = + dyn_cast(getOperation()->getParentOp()); + if (!forallOp) + return this->emitOpError("expected forall op parent"); - // TODO: PerformConcurrentlyOpInterface. + // TODO: InParallelOpInterface. for (Operation &op : getRegion().front().getOperations()) { if (!isa(op)) { return this->emitOpError("expected only ") @@ -1466,14 +1463,14 @@ // Verify that inserts are into out block arguments. Value dest = cast(op).getDest(); - ArrayRef regionOutArgs = foreachThreadOp.getRegionOutArgs(); + ArrayRef regionOutArgs = forallOp.getRegionOutArgs(); if (!llvm::is_contained(regionOutArgs, dest)) return op.emitOpError("may only insert into an output block argument"); } return success(); } -void PerformConcurrentlyOp::print(OpAsmPrinter &p) { +void InParallelOp::print(OpAsmPrinter &p) { p << " "; p.printRegion(getRegion(), /*printEntryBlockArgs=*/false, @@ -1481,8 +1478,7 @@ p.printOptionalAttrDict(getOperation()->getAttrs()); } -ParseResult PerformConcurrentlyOp::parse(OpAsmParser &parser, - OperationState &result) { +ParseResult InParallelOp::parse(OpAsmParser &parser, OperationState &result) { auto &builder = parser.getBuilder(); SmallVector regionOperands; @@ -1500,11 +1496,11 @@ return success(); } -OpResult PerformConcurrentlyOp::getParentResult(int64_t idx) { +OpResult InParallelOp::getParentResult(int64_t idx) { return getOperation()->getParentOp()->getResult(idx); } -SmallVector PerformConcurrentlyOp::getDests() { +SmallVector InParallelOp::getDests() { return llvm::to_vector<4>( llvm::map_range(getYieldingOps(), [](Operation &op) { // Add new ops here as needed. @@ -1513,7 +1509,7 @@ })); } -llvm::iterator_range PerformConcurrentlyOp::getYieldingOps() { +llvm::iterator_range InParallelOp::getYieldingOps() { return getRegion().front().getOperations(); } diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1003,9 +1003,9 @@ }; /// Return `true` if the given loop may have 0 iterations. -bool mayHaveZeroIterations(scf::ForeachThreadOp foreachThreadOp) { - for (auto [lb, ub] : llvm::zip(foreachThreadOp.getMixedLowerBound(), - foreachThreadOp.getMixedUpperBound())) { +bool mayHaveZeroIterations(scf::ForallOp forallOp) { + for (auto [lb, ub] : llvm::zip(forallOp.getMixedLowerBound(), + forallOp.getMixedUpperBound())) { std::optional lbConst = getConstantIntValue(lb); std::optional ubConst = getConstantIntValue(ub); if (!lbConst.has_value() || !ubConst.has_value() || *lbConst >= *ubConst) @@ -1014,39 +1014,39 @@ return false; } -/// Bufferization of ForeachThreadOp. This also bufferizes the terminator of the -/// region. There are op interfaces for the terminators (PerformConcurrentlyOp +/// Bufferization of ForallOp. This also bufferizes the terminator of the +/// region. There are op interfaces for the terminators (InParallelOp /// and ParallelInsertSliceOp), but these are only used during analysis. 
Not /// for bufferization. -struct ForeachThreadOpInterface - : public BufferizableOpInterface::ExternalModel { +struct ForallOpInterface + : public BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { - auto foreachThreadOp = cast(op); + auto forallOp = cast(op); // If the loop has zero iterations, the results of the op are their // corresponding shared_outs, meaning that the shared_outs bufferize to a // read. - if (mayHaveZeroIterations(foreachThreadOp)) + if (mayHaveZeroIterations(forallOp)) return true; - // scf::ForeachThreadOp alone doesn't bufferize to a memory read, one of the + // scf::ForallOp alone doesn't bufferize to a memory read, one of the // uses of its matching bbArg may. - return state.isValueRead(foreachThreadOp.getTiedBlockArgument(&opOperand)); + return state.isValueRead(forallOp.getTiedBlockArgument(&opOperand)); } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { - // Outputs of scf::ForeachThreadOps are always considered as a write. + // Outputs of scf::ForallOps are always considered as a write. return true; } AliasingOpResultList getAliasingOpResults(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { - auto foreachThreadOp = cast(op); - return {{{foreachThreadOp.getTiedOpResult(&opOperand), - BufferRelation::Equivalent}}}; + auto forallOp = cast(op); + return { + {{forallOp.getTiedOpResult(&opOperand), BufferRelation::Equivalent}}}; } bool isWritable(Operation *op, Value value, @@ -1057,12 +1057,12 @@ LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { OpBuilder::InsertionGuard guard(rewriter); - auto foreachThreadOp = cast(op); - int64_t rank = foreachThreadOp.getRank(); + auto forallOp = cast(op); + int64_t rank = forallOp.getRank(); // Get buffers for all output operands. SmallVector buffers; - for (Value out : foreachThreadOp.getOutputs()) { + for (Value out : forallOp.getOutputs()) { FailureOr buffer = getBuffer(rewriter, out, options); if (failed(buffer)) return failure(); @@ -1070,36 +1070,34 @@ } // Use buffers instead of block arguments. - rewriter.setInsertionPointToStart(foreachThreadOp.getBody()); - for (const auto &it : - llvm::zip(foreachThreadOp.getBody()->getArguments().drop_front(rank), - buffers)) { + rewriter.setInsertionPointToStart(forallOp.getBody()); + for (const auto &it : llvm::zip( + forallOp.getBody()->getArguments().drop_front(rank), buffers)) { BlockArgument bbArg = std::get<0>(it); Value buffer = std::get<1>(it); Value bufferAsTensor = - rewriter.create(foreachThreadOp.getLoc(), buffer); + rewriter.create(forallOp.getLoc(), buffer); bbArg.replaceAllUsesWith(bufferAsTensor); } - // Create new ForeachThreadOp without any results and drop the automatically + // Create new ForallOp without any results and drop the automatically // introduced terminator. 
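Once bufferization has run to completion, the tensor form with `shared_outs` ends up as a result-less loop over memrefs, roughly of this shape (illustrative sketch; `%t`, `%buf` and `%c4` are assumed to be defined, and the trivial self-copy body mirrors the self-copy noted in the empty-tensor-elimination test later in this patch):

  // Before bufferization: the destination is threaded through as a shared_out.
  %r = scf.forall (%iv) in (%c4) shared_outs(%o = %t) -> (tensor<4xf32>) {
    %v = tensor.extract_slice %o[%iv] [1] [1] : tensor<4xf32> to tensor<1xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %v into %o[%iv] [1] [1]
          : tensor<1xf32> into tensor<4xf32>
    }
  }

  // After bufferization: no shared_outs and no results; the body addresses the
  // buffer directly.
  scf.forall (%iv) in (%c4) {
    %v = memref.load %buf[%iv] : memref<4xf32>
    memref.store %v, %buf[%iv] : memref<4xf32>
  }

Accordingly, the replacement loop created just below takes no outputs (`/*outputs=*/ValueRange()`), and its automatically created terminator is erased before the old body is merged in.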
- rewriter.setInsertionPoint(foreachThreadOp); - ForeachThreadOp newForeachThreadOp; - newForeachThreadOp = rewriter.create( - foreachThreadOp.getLoc(), foreachThreadOp.getMixedLowerBound(), - foreachThreadOp.getMixedUpperBound(), foreachThreadOp.getMixedStep(), - /*outputs=*/ValueRange(), foreachThreadOp.getMapping()); + rewriter.setInsertionPoint(forallOp); + ForallOp newForallOp; + newForallOp = rewriter.create( + forallOp.getLoc(), forallOp.getMixedLowerBound(), + forallOp.getMixedUpperBound(), forallOp.getMixedStep(), + /*outputs=*/ValueRange(), forallOp.getMapping()); - newForeachThreadOp.getBody()->getTerminator()->erase(); + newForallOp.getBody()->getTerminator()->erase(); // Move over block contents of the old op. SmallVector replacementBbArgs; - replacementBbArgs.append( - newForeachThreadOp.getBody()->getArguments().begin(), - newForeachThreadOp.getBody()->getArguments().end()); - replacementBbArgs.append(foreachThreadOp.getOutputs().size(), Value()); - rewriter.mergeBlocks(foreachThreadOp.getBody(), - newForeachThreadOp.getBody(), replacementBbArgs); + replacementBbArgs.append(newForallOp.getBody()->getArguments().begin(), + newForallOp.getBody()->getArguments().end()); + replacementBbArgs.append(forallOp.getOutputs().size(), Value()); + rewriter.mergeBlocks(forallOp.getBody(), newForallOp.getBody(), + replacementBbArgs); // Remove the old op and replace all of its uses. replaceOpWithBufferizedValues(rewriter, op, buffers); @@ -1110,29 +1108,29 @@ FailureOr getBufferType(Operation *op, Value value, const BufferizationOptions &options, const DenseMap &fixedTypes) const { - auto foreachThreadOp = cast(op); + auto forallOp = cast(op); if (auto bbArg = value.dyn_cast()) // A tensor block argument has the same bufferized type as the // corresponding output operand. return bufferization::getBufferType( - foreachThreadOp.getTiedOpOperand(bbArg)->get(), options, fixedTypes); + forallOp.getTiedOpOperand(bbArg)->get(), options, fixedTypes); // The bufferized result type is the same as the bufferized type of the // corresponding output operand. return bufferization::getBufferType( - foreachThreadOp.getOutputs()[value.cast().getResultNumber()], + forallOp.getOutputs()[value.cast().getResultNumber()], options, fixedTypes); } bool isRepetitiveRegion(Operation *op, unsigned index) const { - auto foreachThreadOp = cast(op); + auto forallOp = cast(op); // This op is repetitive if it has 1 or more steps. // If the control variables are dynamic, it is also considered so. - for (auto [lb, ub, step] : llvm::zip(foreachThreadOp.getMixedLowerBound(), - foreachThreadOp.getMixedUpperBound(), - foreachThreadOp.getMixedStep())) { + for (auto [lb, ub, step] : + llvm::zip(forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(), + forallOp.getMixedStep())) { std::optional lbConstant = getConstantIntValue(lb); if (!lbConstant) return true; @@ -1152,10 +1150,10 @@ } }; -/// Nothing to do for PerformConcurrentlyOp. -struct PerformConcurrentlyOpInterface - : public BufferizableOpInterface::ExternalModel< - PerformConcurrentlyOpInterface, PerformConcurrentlyOp> { +/// Nothing to do for InParallelOp. 
+struct InParallelOpInterface + : public BufferizableOpInterface::ExternalModel { LogicalResult bufferize(Operation *op, RewriterBase &b, const BufferizationOptions &options) const { llvm_unreachable("op does not have any tensor OpOperands / OpResults"); @@ -1174,9 +1172,8 @@ ExecuteRegionOp::attachInterface(*ctx); ForOp::attachInterface(*ctx); IfOp::attachInterface(*ctx); - ForeachThreadOp::attachInterface(*ctx); - PerformConcurrentlyOp::attachInterface( - *ctx); + ForallOp::attachInterface(*ctx); + InParallelOp::attachInterface(*ctx); WhileOp::attachInterface(*ctx); YieldOp::attachInterface(*ctx); }); diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp --- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp @@ -177,13 +177,12 @@ } return failure(); } - if (scf::ForeachThreadOp foreachThreadOp = - scf::getForeachThreadOpThreadIndexOwner(iv)) { - for (int64_t idx = 0; idx < foreachThreadOp.getRank(); ++idx) { - if (foreachThreadOp.getInductionVar(idx) == iv) { - lb = foreachThreadOp.getMixedLowerBound()[idx]; - ub = foreachThreadOp.getMixedUpperBound()[idx]; - step = foreachThreadOp.getMixedStep()[idx]; + if (scf::ForallOp forallOp = scf::getForallOpThreadIndexOwner(iv)) { + for (int64_t idx = 0; idx < forallOp.getRank(); ++idx) { + if (forallOp.getInductionVar(idx) == iv) { + lb = forallOp.getMixedLowerBound()[idx]; + ub = forallOp.getMixedUpperBound()[idx]; + step = forallOp.getMixedStep()[idx]; return success(); } } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir @@ -151,7 +151,7 @@ %f0 = arith.constant 0.0: f32 %c512 = arith.constant 512 : index - %r1 = scf.foreach_thread (%iv) in (%c512) shared_outs(%o = %t) -> (tensor) { + %r1 = scf.forall (%iv) in (%c512) shared_outs(%o = %t) -> (tensor) { // tensor.empty itself does not alloc but forwards to the insert_slice. // EmptyTensorOpElimination replaces the tensor.empty with an inplace // extract_slice. @@ -162,7 +162,7 @@ %f = linalg.fill ins(%f0 : f32) outs(%a : tensor) -> tensor // Self-copy canonicalizes away later. 
- scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %f into %o[42][%sz][1]: tensor into tensor } } diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -21,7 +21,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.forall (%i, %j) in (%c7, %c900) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -33,7 +33,7 @@ %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -62,7 +62,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.forall (%i, %j) in (%c7, %c900) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -74,7 +74,7 @@ %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -89,7 +89,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{The requested GPU threads are fewer than the number of loop trip counts. Try to tile scf.foreach_thread before mapping or set small blockDim.}} + // expected-error @below {{The requested GPU threads are fewer than the number of loop trip counts. 
Try to tile scf.forall before mapping or set small blockDim.}} transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] } } @@ -101,7 +101,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.forall (%i, %j) in (%c7, %c900) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -135,9 +135,9 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!pdl.operation) -> !pdl.operation - %foreach, %tiled = transform.structured.tile_to_foreach_thread_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) + %foreach, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{only bufferized scf.foreach_thread lowers to gpu.thread_id}} + // expected-error @below {{only bufferized scf.forall lowers to gpu.thread_id}} transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] } } @@ -167,14 +167,14 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.forall (%i, %j) in (%c7, %c900) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : memref<2 x 32 x f32> } { mapping = [#gpu.thread, #gpu.thread] } - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -189,7 +189,7 @@ transform.sequence failures(propagate) { ^bb0(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{could not find a unique topLevel scf.foreach_thread}} + // expected-error @below {{could not find a unique topLevel scf.forall}} %1 = transform.gpu.map_foreach_to_blocks %funcop } @@ -202,14 +202,14 @@ %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - scf.foreach_thread (%i, %j) in (%c7, %c65537) { + scf.forall (%i, %j) in (%c7, %c65537) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : memref<2 x 32 x f32> } { mapping = [#gpu.thread, #gpu.thread] } - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -222,7 +222,7 @@ transform.sequence failures(propagate) { ^bb0(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{could not find a unique topLevel scf.foreach_thread}} + // expected-error @below {{could not find a unique topLevel 
scf.forall}} %1 = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch } } @@ -231,7 +231,7 @@ func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { %one = arith.constant 1 : index %c65535 = arith.constant 65535 : index - scf.foreach_thread (%i, %j) in (%c65535, %c65535) { + scf.forall (%i, %j) in (%c65535, %c65535) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -256,7 +256,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c32, %c32) { + scf.forall (%i, %j) in (%c32, %c32) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = arith.mulf %4, %5 : f32 @@ -300,6 +300,6 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{transform.structured.tile_to_foreach_thread_op failed to apply}} - %foreach, %tiled = transform.structured.tile_to_foreach_thread_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) + // expected-error @below {{transform.structured.tile_to_forall_op failed to apply}} + %foreach, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) } diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -19,7 +19,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 @@ -68,13 +68,13 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (%c12) { + scf.forall (%i) in (%c12) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d @@ -112,8 +112,8 @@ // CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] // CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] - scf.foreach_thread (%i, %j) in (%c32, %c64) { - scf.foreach_thread (%k, %l) in (%c4, %c32) { + scf.forall (%i, %j) in (%c32, %c64) { + scf.forall (%k, %l) in (%c4, %c32) { %4 = memref.load %x[%i, %j, %k, %l] : !type4d %5 = memref.load %y[%i, %j, %k, %l] : !type4d %6 = math.fma %alpha, %4, %5 : f32 @@ -146,7 +146,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, 
%arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 @@ -178,7 +178,7 @@ // CHECK: %[[TIDX:.*]] = gpu.thread_id x // CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]] // CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]] - scf.foreach_thread (%i) in (%c32) { + scf.forall (%i) in (%c32) { %4 = memref.load %x[%i, %i] : !type %5 = memref.load %y[%i, %i] : !type %6 = arith.mulf %4, %5 : f32 @@ -211,7 +211,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j, %k) in (%one, %c7, %c9) { + scf.forall (%i, %j, %k) in (%one, %c7, %c9) { // CHECK: memref.load %{{.*}}[%[[C0]], // CHECK: memref.load %{{.*}}[%[[C0]], %4 = memref.load %x[%i, %j, %k] : !type @@ -248,13 +248,13 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (%c12) { + scf.forall (%i) in (%c12) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -880,10 +880,10 @@ %c4 = arith.constant 4 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<4x2xf32> - %res = scf.foreach_thread (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) { + %res = scf.forall (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) { %1 = tensor.empty() : tensor<1x1xf32> %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}} // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor into tensor<4x2xf32> tensor.parallel_insert_slice %2 into %o[%arg0, %arg1] [1, 1] [1, 1] : diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir --- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir @@ -13,14 +13,14 @@ // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul(%A: tensor, %B: tensor, %C: tensor) -> tensor { - // CHECK: scf.foreach_thread ({{.*}}) in (10, 20) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor) { + // CHECK: scf.forall ({{.*}}) in (10, 20) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor) { // CHECK: %[[tA:.*]] = tensor.extract_slice %[[A]]{{.*}} : tensor to tensor // CHECK: %[[tB:.*]] = tensor.extract_slice %[[B]]{{.*}} : tensor to tensor // CHECK: %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor to tensor // CHECK: %[[RES:.*]] = linalg.matmul // 
CHECK-SAME: ins(%[[tA]], %[[tB]] : tensor, tensor) // CHECK-SAME: outs(%[[tC]] : tensor) -> tensor - // CHECK: scf.foreach_thread.perform_concurrently { + // CHECK: scf.forall.in_parallel { // CHECK-NEXT: tensor.parallel_insert_slice %[[RES]] into %[[C_BLK]]{{.*}} : // CHECK-SAME: tensor into tensor // CHECK-NEXT: } @@ -33,7 +33,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [10, 20] (mapping = [ #gpu.thread, #gpu.thread ] ) + %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] (mapping = [ #gpu.thread, #gpu.thread ] ) } } @@ -58,12 +58,12 @@ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size_1]]] // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map0]]()[%[[N]], %[[tile_size_2]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] // CHECK: linalg.matmul - // CHECK: scf.foreach_thread.perform_concurrently + // CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %tile_size_1 = "test.dummy"() : () -> (index) %tile_size_2 = "test.dummy"() : () -> (index) @@ -76,7 +76,7 @@ ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes %sz + %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes %sz } // ----- @@ -93,7 +93,7 @@ // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (10, 21) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 21) shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) // CHECK-NOT: affine.min @@ -104,7 +104,7 @@ // CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] : // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] : // CHECK: linalg.matmul - // CHECK: scf.foreach_thread.perform_concurrently + // CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>) outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>) @@ -114,7 +114,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [10, 21] + %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 21] } @@ -136,7 +136,7 @@ // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]] // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] - // 
CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) @@ -145,7 +145,7 @@ // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] // CHECK: linalg.matmul - // CHECK: scf.foreach_thread.perform_concurrently + // CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %0 = linalg.matmul ins(%A, %B : tensor, tensor) outs(%C : tensor) -> (tensor) @@ -155,7 +155,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [10, 20] + %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes [10, 20] } // ----- @@ -171,7 +171,7 @@ // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK-NOT: affine.max // CHECK-NOT: affine.min @@ -181,7 +181,7 @@ // CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] : // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] : // CHECK: linalg.matmul - // CHECK: scf.foreach_thread.perform_concurrently + // CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>) outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>) @@ -191,7 +191,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [10, 21] + %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes [10, 21] } // ----- @@ -213,15 +213,15 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [2] ( mapping = [#gpu.thread]) + %1:2 = transform.structured.tile_to_forall_op %0 num_threads [2] ( mapping = [#gpu.thread]) } } // CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 2)> // CHECK-LABEL: extract_source( -// CHECK: scf.foreach_thread (%[[ARG:.*]]) in (2) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) { +// CHECK: scf.forall (%[[ARG:.*]]) in (2) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) { // CHECK: %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]]) -// CHECK: scf.foreach_thread.perform_concurrently { +// CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32> // ----- @@ -247,12 +247,12 @@ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK-DAG: %[[NT0:.+]] = affine.apply 
#[[$map0]]()[%[[M]], %[[tile_size]]] // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] // CHECK: linalg.matmul - // CHECK: scf.foreach_thread.perform_concurrently + // CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %tile_size = "test.dummy"() : () -> (index) %0 = linalg.matmul ins(%A, %B : tensor, tensor) @@ -264,7 +264,7 @@ ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [%sz, 20] + %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes [%sz, 20] } // ----- @@ -282,7 +282,7 @@ func.func @tile_output_multi_1d_static(%IN1: tensor<100xf32>, %IN2: tensor<100xf32>, %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>) -> (tensor<100xf32>, tensor<100xf32>) { -// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) +// CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) // CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]]) // CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) // CHECK-NOT: affine.min @@ -293,7 +293,7 @@ // CHECK: %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]]] [%[[TS]]] [1] : // CHECK: %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [%[[TS]]] [1] : // CHECK: %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic -// CHECK: scf.foreach_thread.perform_concurrently +// CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]]] [%[[TS]]] [1] : // CHECK-NEXT: tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [%[[TS]]] [1] : %res1, %res2 = linalg.generic @@ -317,7 +317,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %foreach_thread, %tiled_generic = transform.structured.tile_to_foreach_thread_op %0 num_threads [7] + %forall, %tiled_generic = transform.structured.tile_to_forall_op %0 num_threads [7] } // ----- @@ -337,14 +337,14 @@ func.func @tile_output_multi_1d2d_static(%IN1: tensor<100xf32>, %IN2: tensor<100x300xf32>, %IN3: tensor<300xf32>, %OUT1: tensor<300x100xf32>, %OUT2: tensor<300xf32>) -> (tensor<300x100xf32>, tensor<300xf32>) { -// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (4) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) +// CHECK: scf.forall (%[[IV0:.+]]) in (4) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) // CHECK: %[[LB:.+]] = affine.apply #[[$map0]](%[[IV0]]) // CHECK: %[[tIN1:.+]] = tensor.extract_slice %[[IN2]][0, %[[LB]]] [100, 75] // CHECK: %[[tIN2:.+]] = tensor.extract_slice %[[IN3]][%[[LB]]] [75] // CHECK: %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]], 0] [75, 100] // CHECK: %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [75] // CHECK: %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic -// 
CHECK: scf.foreach_thread.perform_concurrently +// CHECK: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]], 0] [75, 100] // CHECK-NEXT: tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [75] %res2, %res3 = linalg.generic { @@ -370,6 +370,6 @@ transform.sequence failures(propagate) { ^bb1(%IN_MAT2: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %IN_MAT2 : (!pdl.operation) -> !pdl.operation - %foreach_thread, %tiled_generic = transform.structured.tile_to_foreach_thread_op %0 num_threads [4] + %forall, %tiled_generic = transform.structured.tile_to_forall_op %0 num_threads [4] } diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir --- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir @@ -16,8 +16,8 @@ %d0 = tensor.dim %arg1, %c0 : tensor %1 = affine.apply #map0()[%d0, %arg0] - // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor) { + // CHECK: scf.forall {{.*}} { + %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor) { %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%d0, %arg0] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -28,7 +28,7 @@ // CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]] %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor } } @@ -44,7 +44,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation // linalg.fill is tileable. The op is tiled and fused. transform.structured.fuse_into_containing_op %0 into %1 @@ -66,8 +66,8 @@ %0 = tensor.empty(%arg0) : tensor %1 = affine.apply #map0()[%arg0] - // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<64xf32>) { + // CHECK: scf.forall {{.*}} { + %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<64xf32>) { // CHECK: %[[INIT_TENSOR:.*]] = tensor.empty %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%arg0] @@ -75,7 +75,7 @@ // CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[INIT_TENSOR]] %7 = linalg.elemwise_unary ins(%0 : tensor) outs(%5 : tensor) -> tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor<64xf32> } } @@ -87,7 +87,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["tensor.empty"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation // tensor.empty is not tileable. The op is cloned and fused. 
transform.structured.fuse_into_containing_op %0 into %1 @@ -111,8 +111,8 @@ %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor - // CHECK: scf.foreach_thread {{.*}} -> (tensor) { - %2 = scf.foreach_thread (%arg3) in (%d0) shared_outs(%o = %0) -> (tensor) { + // CHECK: scf.forall {{.*}} -> (tensor) { + %2 = scf.forall (%arg3) in (%d0) shared_outs(%o = %0) -> (tensor) { %5 = tensor.extract_slice %o[%arg3] [1] [1] : tensor to tensor // CHECK: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor to tensor<1xf32> @@ -121,7 +121,7 @@ // CHECK: func.call @foo(%{{.*}}) : (tensor) -> tensor %7 = func.call @foo(%5) : (tensor) -> tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%{{.*}}] [1] [1] : tensor into tensor tensor.parallel_insert_slice %7 into %o[%arg3] [1] [1] : tensor into tensor } @@ -133,7 +133,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation // linalg.fill is tileable. The op is tiled and fused. transform.structured.fuse_into_containing_op %0 into %1 @@ -158,8 +158,8 @@ %d0 = tensor.dim %arg1, %c0 : tensor %1 = affine.apply #map0()[%d0, %arg0] - // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor) { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %0) -> (tensor) { + // CHECK: scf.forall {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor) { + %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %0) -> (tensor) { %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%d0, %arg0] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -170,7 +170,7 @@ // CHECK: %[[T2:.*]] = linalg.elemwise_unary {{.*}} outs(%[[T1]] %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor } } @@ -181,7 +181,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation // linalg.fill is tileable. The op is tiled and fused. 
transform.structured.fuse_into_containing_op %0 into %1 @@ -218,8 +218,8 @@ %1 = affine.apply #map0()[%d0, %idx] - // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%i) in (%1) shared_outs(%o = %out_2) -> (tensor) { + // CHECK: scf.forall {{.*}} { + %2 = scf.forall (%i) in (%1) shared_outs(%o = %out_2) -> (tensor) { %3 = affine.apply #map1(%i)[%idx] %4 = affine.min #map2(%i)[%d0, %idx] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -230,7 +230,7 @@ // CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]#0 %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor } } @@ -241,7 +241,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation // linalg.generic is tileable. The op is tiled and fused. transform.structured.fuse_into_containing_op %0 into %1 diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir --- a/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir @@ -4,7 +4,7 @@ module { // CHECK: func @foo - // CHECK: scf.foreach_thread {{.*}} { + // CHECK: scf.forall {{.*}} { // CHECK: linalg.fill // CHECK: linalg.matmul // CHECK: linalg.generic @@ -47,21 +47,21 @@ %producers = transform.structured.match attributes{"__producer__"} in %arg1 : (!pdl.operation) -> !pdl.operation // Tile the root. - %foreach_thread_op, %tiled_op = transform.structured.tile_to_foreach_thread_op %root num_threads [10, 20] + %forall_op, %tiled_op = transform.structured.tile_to_forall_op %root num_threads [10, 20] // Fuse all producers. - transform.structured.fuse_into_containing_op %producers into %foreach_thread_op + transform.structured.fuse_into_containing_op %producers into %forall_op } } // ----- -// Inverse the order of the payload ops passed to the tile_to_foreach_thread_op +// Inverse the order of the payload ops passed to the tile_to_forall_op // op. Fusion should still work. module { // CHECK: func @foo - // CHECK: scf.foreach_thread {{.*}} { + // CHECK: scf.forall {{.*}} { // CHECK: linalg.fill // CHECK: linalg.matmul // CHECK: linalg.generic @@ -105,9 +105,9 @@ %reversed_producers = transform.test_reverse_payload_ops %producers // Tile the root. - %foreach_thread_op, %tiled_op = transform.structured.tile_to_foreach_thread_op %root num_threads [10, 20] + %forall_op, %tiled_op = transform.structured.tile_to_forall_op %root num_threads [10, 20] // Fuse all producers. 
- transform.structured.fuse_into_containing_op %reversed_producers into %foreach_thread_op + transform.structured.fuse_into_containing_op %reversed_producers into %forall_op } } diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir --- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir @@ -109,7 +109,7 @@ transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 + %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 by num_threads = [0, 5], tile_sizes = [] } @@ -127,7 +127,7 @@ // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] // CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]]) // CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor @@ -139,7 +139,7 @@ // CHECK: arith.addf // CHECK: linalg.yield // CHECK: } -> tensor -// CHECK: scf.foreach_thread.perform_concurrently { +// CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor into tensor // CHECK: } // CHECK: } @@ -161,7 +161,7 @@ transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 + %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 by num_threads = [0, 0, 5], tile_sizes = [] } @@ -181,7 +181,7 @@ // CHECK-DAG: %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] // CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]]) // CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor to tensor @@ -190,7 +190,7 @@ // CHECK: %[[INCHUNKB:.+]] = tensor.extract_slice %[[ARG1]][%[[TINDEX]], 0] [%[[TS1]], %[[D2]]] [1, 1] : tensor to tensor // CHECK: %[[TEMPEXT:.+]] = tensor.extract_slice %[[ET]][0, 0] [%[[D0]], %[[D2]]] [1, 1] : tensor to tensor // CHECK: %[[PARTIAL:.+]] = linalg.matmul ins(%[[INCHUNKA]], %[[INCHUNKB]] : tensor, tensor) outs(%[[TEMPEXT]] : tensor) -> tensor -// CHECK: scf.foreach_thread.perform_concurrently { +// CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor into tensor // CHECK: } // CHECK: } @@ -220,7 +220,7 @@ 
transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 + %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread] } @@ -238,7 +238,7 @@ // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor // CHECK: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[LB:.+]] = affine.apply #[[MAP0]]()[%[[IV]]] @@ -255,7 +255,7 @@ // CHECK: %[[INS:.+]] = tensor.insert_slice %[[PARTIAL]] into %[[ACC]][0] [%[[D3]]] [1] : tensor into tensor // CHECK: scf.yield %[[INS]] : tensor // CHECK: } -// CHECK: scf.foreach_thread.perform_concurrently { +// CHECK: scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice %[[CARRY]] into %[[ARG3]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor into tensor // CHECK: } // CHECK: } @@ -285,7 +285,7 @@ transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 + %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread] // CHECK: expecting fill @@ -303,7 +303,7 @@ // ----- -func.func @reduction_untiled_foreach_thread( +func.func @reduction_untiled_forall( %arg0: tensor, %out: tensor) -> tensor { // expected-note @below {{target operation}} %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, @@ -323,7 +323,7 @@ ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation // expected-error @below {{could not tile reduction}} - %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 + %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 by num_threads = [5], tile_sizes = [3], mapping = [#gpu.thread] } diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -1486,8 +1486,8 @@ // CHECK: %[[c1:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index - %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) { - scf.foreach_thread.perform_concurrently { + %2 = scf.forall (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) { + scf.forall.in_parallel { tensor.parallel_insert_slice %arg0 into %o[%tidx, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor } } diff --git a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir --- a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir +++ b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir @@ -8,7 +8,7 @@ linalg.fill ins(%cst_0 : 
f32) outs(%0 : memref<128x384xf32>) %2 = memref.alloc() : memref<128xf32> linalg.fill ins(%cst_0 : f32) outs(%2 : memref<128xf32>) - scf.foreach_thread (%arg0) in (%c2) { + scf.forall (%arg0) in (%c2) { %7 = affine.min affine_map<(d0) -> (d0 * -64 + 128, 64)>(%arg0) %8 = affine.max affine_map<(d0) -> (0, d0)>(%7) %9 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -548,9 +548,9 @@ %num_threads = arith.constant 100 : index // expected-error @+1 {{1 operands present, but expected 2}} - %result:2 = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { + %result:2 = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32> } @@ -564,9 +564,9 @@ %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) { + %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { // expected-error @+1 {{may only insert into an output block argument}} tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32> @@ -581,10 +581,10 @@ %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) { + %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}} - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32> %0 = arith.constant 1: index @@ -598,8 +598,8 @@ func.func @mismatched_mapping(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { %one = arith.constant 1 : index %c65535 = arith.constant 65535 : index - // expected-error @below {{'scf.foreach_thread' op mapping attribute size must match op rank}} - scf.foreach_thread (%i, %j) in (%c65535, %c65535) { + // expected-error @below {{'scf.forall' op mapping attribute size must match op rank}} + scf.forall (%i, %j) in (%c65535, %c65535) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir @@ -614,7 +614,7 @@ %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 %c320 = arith.constant 320 : index - %4 = scf.foreach_thread (%arg0) in (%c320) 
shared_outs(%arg1 = %2) -> (tensor<320xf32>) { + %4 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) { // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]} %5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1] : tensor<320x10240xf32> to tensor<1x10240xf32> // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]} @@ -624,7 +624,7 @@ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]} tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32> } diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir @@ -108,23 +108,23 @@ // ----- -// CHECK-LABEL: func @scf_foreach_thread_out_of_place( +// CHECK-LABEL: func @scf_forall_out_of_place( // CHECK-SAME: %[[arg0:.*]]: tensor<100xf32>, %[[arg1:.*]]: tensor<100xf32> -// CHECK-FUNC-LABEL: func @scf_foreach_thread_out_of_place( -func.func @scf_foreach_thread_out_of_place(%in: tensor<100xf32>, +// CHECK-FUNC-LABEL: func @scf_forall_out_of_place( +func.func @scf_forall_out_of_place(%in: tensor<100xf32>, %out: tensor<100xf32>) { %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index // CHECK-FUNC-NOT: alloc_tensor // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() copy(%[[arg1]]) {bufferization.escape = [false]} : tensor<100xf32> - // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[o:.*]] = %[[alloc]]) - %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> { + // CHECK: scf.forall {{.*}} shared_outs(%[[o:.*]] = %[[alloc]]) + %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> { // CHECK: tensor.extract_slice - // CHECK: scf.foreach_thread.perform_concurrently + // CHECK: scf.forall.in_parallel // CHECK: tensor.parallel_insert_slice %{{.*}} into %[[o]] %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32> } diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir @@ -543,8 +543,8 @@ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - // CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]]) - %2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor) { + // CHECK: scf.forall (%[[tidx:.*]]) in (%[[idx2]]) + %2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor) { // CHECK: %[[subview:.*]] = memref.subview %[[arg2]][5] [%[[idx]]] [1] %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor // CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview]] : memref into tensor } @@ -589,8 +589,8 @@ // CHECK: %[[alloc1:.*]] = memref.alloc // CHECK: memref.copy %[[arg2]], %[[alloc1]] - // CHECK: scf.foreach_thread (%[[tidx:.*]]) in 
(%[[idx2]]) - %2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor) { + // CHECK: scf.forall (%[[tidx:.*]]) in (%[[idx2]]) + %2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor) { // CHECK: %[[subview1:.*]] = memref.subview %[[alloc1]][5] [%[[idx]]] [1] %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor @@ -601,9 +601,9 @@ // CHECK: memref.copy %[[subview1]], %[[subview1]] // Empty terminator is elided from pretty-printing. - // CHECK-NOT: scf.foreach_thread.perform_concurrently + // CHECK-NOT: scf.forall.in_parallel // CHECK-NOT: parallel_insert_slice - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] : tensor into tensor } @@ -629,8 +629,8 @@ %c2 = arith.constant 2 : index %c4 = arith.constant 4 : index - // CHECK: scf.foreach_thread {{.*}} - %0 = scf.foreach_thread (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) { + // CHECK: scf.forall {{.*}} + %0 = scf.forall (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) { %1 = affine.apply #map0(%arg3) %3 = tensor.extract_slice %arg0[%1, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32> %4 = affine.apply #map1(%arg4) @@ -639,7 +639,7 @@ // CHECK: linalg.matmul ins({{.*}}memref<4x8xf32, strided<[?, ?], offset: ?>>, memref<8x4xf32, strided<[?, ?], offset: ?>>) outs({{.*}} : memref<4x4xf32, strided<[?, ?], offset: ?>>) %8 = linalg.matmul ins(%3, %6 : tensor<4x8xf32>, tensor<8x4xf32>) outs(%7 : tensor<4x4xf32>) -> tensor<4x4xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %8 into %o[%1, %4] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32> } } @@ -658,19 +658,19 @@ // CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32> // CHECK: memref.copy %[[t]], %[[t_copy]] - // CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) { + // CHECK: scf.forall (%{{.*}}) in (%{{.*}}) { // Load from the copy and store into the shared output. 
// CHECK: %[[subview:.*]] = memref.subview %[[t]] // CHECK: memref.load %[[t_copy]] // CHECK: memref.store %{{.*}}, %[[subview]] - %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> { + %0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> { %offset = arith.muli %c5, %tid : index %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32> %r2 = tensor.extract %t[%tid] : tensor<10xf32> %i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %i into %o[%offset] [5] [1] : tensor<5xf32> into tensor<10xf32> } @@ -691,8 +691,8 @@ // CHECK-NOT: memref.alloc // CHECK-NOT: memref.copy - // CHECK: scf.foreach_thread {{.*}} { - %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> { + // CHECK: scf.forall {{.*}} { + %0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> { %offset = arith.muli %c5, %tid : index %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32> @@ -701,7 +701,7 @@ // CHECK: memref.load %[[t1]] %r2 = tensor.extract %t1[%tid] : tensor<10xf32> %i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %i into %o[%offset] [5] [1] : tensor<5xf32> into tensor<10xf32> } diff --git a/mlir/test/Dialect/SCF/ops.mlir b/mlir/test/Dialect/SCF/ops.mlir --- a/mlir/test/Dialect/SCF/ops.mlir +++ b/mlir/test/Dialect/SCF/ops.mlir @@ -311,21 +311,21 @@ return %res : i64 } -// CHECK-LABEL: func.func @normalized_foreach_thread -func.func @normalized_foreach_thread(%in: tensor<100xf32>, %out: tensor<100xf32>) { +// CHECK-LABEL: func.func @normalized_forall +func.func @normalized_forall(%in: tensor<100xf32>, %out: tensor<100xf32>) { %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - // CHECK: scf.foreach_thread + // CHECK: scf.forall // CHECK-NEXT: tensor.extract_slice - // CHECK-NEXT: scf.foreach_thread.perform_concurrently + // CHECK-NEXT: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: return - %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> { + %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32> } @@ -333,23 +333,23 @@ return } -// CHECK-LABEL: func.func @explicit_loop_bounds_foreach_thread -func.func @explicit_loop_bounds_foreach_thread(%in: tensor<100xf32>, +// CHECK-LABEL: func.func @explicit_loop_bounds_forall +func.func @explicit_loop_bounds_forall(%in: tensor<100xf32>, %out: tensor<100xf32>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - // CHECK: scf.foreach_thread + // CHECK: scf.forall // CHECK-NEXT: tensor.extract_slice - // CHECK-NEXT: scf.foreach_thread.perform_concurrently + // CHECK-NEXT: scf.forall.in_parallel // CHECK-NEXT: tensor.parallel_insert_slice // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: return - %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> { + %result = scf.forall 
(%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : tensor<1xf32> into tensor<100xf32> } @@ -357,32 +357,32 @@ return } -// CHECK-LABEL: func.func @normalized_foreach_thread_elide_terminator -func.func @normalized_foreach_thread_elide_terminator() -> () { +// CHECK-LABEL: func.func @normalized_forall_elide_terminator +func.func @normalized_forall_elide_terminator() -> () { %num_threads = arith.constant 100 : index - // CHECK: scf.foreach_thread + // CHECK: scf.forall // CHECK-NEXT: } {mapping = [#gpu.thread]} // CHECK-NEXT: return - scf.foreach_thread (%thread_idx) in (%num_threads) { - scf.foreach_thread.perform_concurrently { + scf.forall (%thread_idx) in (%num_threads) { + scf.forall.in_parallel { } } {mapping = [#gpu.thread]} return } -// CHECK-LABEL: func.func @explicit_loop_bounds_foreach_thread_elide_terminator -func.func @explicit_loop_bounds_foreach_thread_elide_terminator() -> () { +// CHECK-LABEL: func.func @explicit_loop_bounds_forall_elide_terminator +func.func @explicit_loop_bounds_forall_elide_terminator() -> () { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - // CHECK: scf.foreach_thread + // CHECK: scf.forall // CHECK-NEXT: } {mapping = [#gpu.thread]} // CHECK-NEXT: return - scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) { - scf.foreach_thread.perform_concurrently { + scf.forall (%thread_idx) = (%c0) to (%num_threads) step (%c1) { + scf.forall.in_parallel { } } {mapping = [#gpu.thread]} return diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -1531,12 +1531,12 @@ %c1 = arith.constant 1 : index // CHECK-NOT: tensor.cast - // CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor) { - // CHECK-NEXT: scf.foreach_thread.perform_concurrently { + // CHECK: scf.forall (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor) { + // CHECK-NEXT: scf.forall.in_parallel { // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][%[[tidx]], 0] [1, 5] [1, 1] - %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) { + %2 = scf.forall (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) { %3 = tensor.cast %arg0 : tensor<1x5xf32> to tensor - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { tensor.parallel_insert_slice %3 into %o[%tidx, %c0] [%c1, 5] [%c1, %c1] : tensor into tensor } } @@ -1553,11 +1553,11 @@ { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - // CHECK: scf.foreach_thread () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) { - // CHECK-NEXT: scf.foreach_thread.perform_concurrently { + // CHECK: scf.forall () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) { + // CHECK-NEXT: scf.forall.in_parallel { // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][0, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<1x5xf32> - %2 = scf.foreach_thread () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) { - scf.foreach_thread.perform_concurrently { + %2 = scf.forall () in () shared_outs(%o = 
%arg1) -> (tensor<1x5xf32>) { + scf.forall.in_parallel { tensor.parallel_insert_slice %arg0 into %o[%c0, %c0] [1, 5] [%c1, %c1] : tensor<1x5xf32> into tensor<1x5xf32> } } diff --git a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir --- a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir +++ b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir @@ -28,11 +28,11 @@ // FOREACH-DAG: %[[c5:.+]] = arith.constant 5 : index // FOREACH-DAG: %[[c7:.+]] = arith.constant 7 : index // FOREACH-DAG: %[[init:.+]] = tensor.empty() : tensor<20x11xf32> -// FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) in (20) shared_outs(%[[dest:.+]] = %[[init]]) +// FOREACH: %[[tile:.+]] = scf.forall (%[[iv:.+]]) in (20) shared_outs(%[[dest:.+]] = %[[init]]) // FOREACH: %[[multiIndex:.+]]:3 = affine.delinearize_index %[[iv]] into (%[[c3]], %[[c5]], %[[c7]] // FOREACH: %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex]]#0, %[[multiIndex]]#1, %[[multiIndex]]#2, 0] [1, 1, 1, 11] [1, 1, 1, 1] : // FOREACH: %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3]{{\]}} : -// FOREACH: perform_concurrently +// FOREACH: in_parallel // FOREACH-NEXT: tensor.parallel_insert_slice %[[sliceFlat]] into %[[dest]][%[[iv]], 0] [1, 11] [1, 1] : // FOREACH: return %[[tile]] @@ -136,14 +136,14 @@ // FOREACH-DAG: %[[d1:.+]] = tensor.dim %[[arg0]], %[[c1]] : // FOREACH-DAG: %[[d2:.+]] = tensor.dim %[[arg0]], %[[c2]] : // FOREACH-DAG: %[[d4:.+]] = tensor.dim %[[arg0]], %[[c4]] : -// FOREACH: %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]]) +// FOREACH: %[[tile1:.+]] = scf.forall (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]]) // FOREACH-DAG: %[[iv1:.+]] = affine.apply #[[map1]](%[[tid1]])[%[[lb1]]] // FOREACH: %[[multiIndex1:.+]]:3 = affine.delinearize_index %[[iv1]] into (%[[c3]], %[[d1]], %[[d2]]) : // FOREACH-DAG: %[[iv2:.+]] = affine.apply #[[map1]](%[[tid2]])[%[[lb2]]] // FOREACH: %[[multiIndex2:.+]]:2 = affine.delinearize_index %[[iv2]] into (%[[c11]], %[[d4]]) : // FOREACH: %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex1]]#0, %[[multiIndex1]]#1, %[[multiIndex1]]#2, %[[multiIndex2]]#0, %[[multiIndex2]]#1] [1, 1, 1, 1, 1] [1, 1, 1, 1, 1] : // FOREACH: %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3, 4]{{\]}} : -// FOREACH: perform_concurrently +// FOREACH: in_parallel //FOREACH-NEXT: tensor.parallel_insert_slice %[[sliceFlat]] into %[[dest]][%[[tid1]], %[[tid2]]] [1, 1] [1, 1] : // ----- diff --git a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir --- a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir +++ b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir @@ -90,9 +90,9 @@ func.func @parallel_insert_slice(%t0: tensor<1x2xf32>, %t1: tensor, %t2: tensor<1x1xf32>) -> tensor<1x2xf32> { %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %r = scf.foreach_thread (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) { + %r = scf.forall (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) { %inserted_slice = tensor.insert_slice %t1 into %t2[0, 0] [1, 1] [1, 1] : tensor into tensor<1x1xf32> - scf.foreach_thread.perform_concurrently { + scf.forall.in_parallel { 
      tensor.parallel_insert_slice %inserted_slice into %arg4[%arg2, %arg3] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x2xf32>
    }
  }
diff --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -44,8 +44,8 @@
     -> tensor {
   %0 = tensor.collapse_shape %t [[0, 1], [2], [3]] : tensor into tensor
-  %1 = scf.foreach_thread (%iv) in (%thr) shared_outs(%o = %d) -> (tensor) {
-    scf.foreach_thread.perform_concurrently {
+  %1 = scf.forall (%iv) in (%thr) shared_outs(%o = %d) -> (tensor) {
+    scf.forall.in_parallel {
      tensor.parallel_insert_slice %0 into %o[0, 0, 0, 0][%sz, 1, 1, 5][1, 1, 1, 1] : tensor into tensor
    }
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -196,10 +196,10 @@
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
-  // CHECK: scf.foreach_thread {{.*}} {
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
+  // CHECK: scf.forall {{.*}} {
+  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
     %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
      // CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
      // CHECK: memref.subview %{{.*}}[1, %{{.*}}] [1, 1] [1, 1] : memref<200x100xf32, strided<[?, ?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
      tensor.parallel_insert_slice %1 into %o[1, %thread_idx][1, 1][1, 1] :
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -82,7 +82,7 @@
   Option<bool> useForeach{
       *this, "use-foreach",
       llvm::cl::desc(
-          "Use the scf.foreach_thread operation when generating loop nests for "
+          "Use the scf.forall operation when generating loop nests for "
          "the extract_slice of collapse_shape pattern"),
      llvm::cl::init(false)};
@@ -247,7 +247,7 @@
                  tensor::ExtractSliceFromCollapseHelper &helper,
                  PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
-    auto foreachThreadOp = rewriter.create<scf::ForeachThreadOp>(
+    auto forallOp = rewriter.create<scf::ForallOp>(
        loc, /*numThreads=*/getAsOpFoldResult(helper.getIterationSpaceSizes()),
        /*outputs=*/dest,
        /*mapping=*/std::nullopt,
@@ -263,12 +263,12 @@
          auto [tile, insertParams] =
              helper.emitLoopNestBody(nestedBuilder, loc, outputIvs);
          // Insert the slice into the destination.
-          auto term = nestedBuilder.create<scf::PerformConcurrentlyOp>(loc);
+          auto term = nestedBuilder.create<scf::InParallelOp>(loc);
          nestedBuilder.setInsertionPointToStart(term.getBody());
          nestedBuilder.create<tensor::ParallelInsertSliceOp>(
              loc, tile, outputArgs[0], insertParams);
        });
-    rewriter.replaceOp(op, foreachThreadOp->getResult(0));
+    rewriter.replaceOp(op, forallOp->getResult(0));
    return success();
  }
};
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -188,7 +188,7 @@
        [], transform.AnyOpType.get())
    types = [
        transform.OperationType.get(x)
-        for x in ["scf.for", "scf.parallel", "scf.foreach_thread"]
+        for x in ["scf.for", "scf.parallel", "scf.forall"]
    ]
    with InsertionPoint(sequence.body):
      structured.TileOp(types, sequence.bodyTarget, sizes=[2, 3, 4])
@@ -196,7 +196,7 @@
    # CHECK-LABEL: TEST: testTileExplicitLoopTypeAll
    # CHECK: = transform.structured.tile
    # CHECK-SAME : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
-    # CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.foreach_thread">
+    # CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.forall">

@run
def testVectorize():
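
Taken together, the hunks above are a mechanical rename: every `scf.foreach_thread` becomes `scf.forall` and every `scf.foreach_thread.perform_concurrently` terminator becomes `scf.forall.in_parallel`, with the operands and regions left as they were in these tests. As a quick reference, a minimal post-rename loop looks like the sketch below; the function name and the static shapes are illustrative only and are not taken from any test in this patch.

// Illustrative sketch (not part of the patch): one thread per element,
// each thread copies a 1-element slice of %in into the shared output %out.
func.func @forall_example(%in: tensor<8xf32>, %out: tensor<8xf32>) -> tensor<8xf32> {
  %result = scf.forall (%tid) in (8) shared_outs(%o = %out) -> (tensor<8xf32>) {
    %slice = tensor.extract_slice %in[%tid] [1] [1] : tensor<8xf32> to tensor<1xf32>
    // The terminator collects the per-thread writes into the shared tensor.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %slice into %o[%tid] [1] [1] : tensor<1xf32> into tensor<8xf32>
    }
  }
  return %result : tensor<8xf32>
}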