diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -481,9 +481,12 @@
     }
     ```
   }];
-  let arguments = (ins Variadic<Index>:$num_threads,
-                       Variadic<AnyRankedTensor>:$outputs,
-                       OptionalAttr<DeviceMappingArrayAttr>:$mapping);
+  let arguments = (ins
+      Variadic<Index>:$lowerBound,
+      Variadic<Index>:$upperBound,
+      Variadic<Index>:$step,
+      Variadic<AnyRankedTensor>:$outputs,
+      OptionalAttr<DeviceMappingArrayAttr>:$mapping);
   let results = (outs Variadic<AnyRankedTensor>:$results);
   let regions = (region SizedRegion<1>:$region);
 
@@ -496,57 +499,67 @@
   let skipDefaultBuilders = 1;
   let builders = [
     // Bodyless builder, outputs must be specified.
-    OpBuilder<(ins "ValueRange":$outputs, "ValueRange":$num_threads,
-                   "std::optional<ArrayAttr>":$mapping)>,
+    OpBuilder<(ins "ValueRange":$lbs, "ValueRange":$ubs, "ValueRange":$steps,
+                   "ValueRange":$outputs, "std::optional<ArrayAttr>":$mapping)>,
+
     // Builder that takes a bodyBuilder lambda.
-    OpBuilder<(ins "ValueRange":$outputs, "ValueRange":$num_threads,
-                   "ArrayRef<Attribute>":$mapping,
+    OpBuilder<(ins "ValueRange":$lbs, "ValueRange":$ubs, "ValueRange":$steps,
+                   "ValueRange":$outputs, "ArrayRef<Attribute>":$mapping,
                    "function_ref<void(OpBuilder &, Location, ValueRange)>":$bodyBuilder)>
   ];
   let extraClassDeclaration = [{
-    int64_t getRank() { return getNumThreads().size(); }
+    int64_t getRank() { return getLowerBound().size(); }
+
+    /// Number of operands controlling the loop: lbs, ubs, steps
+    unsigned getNumControlOperands() { return 3 * getRank(); }
 
     OpResult getTiedOpResult(OpOperand *opOperand) {
-      assert(opOperand->getOperandNumber() >= getRank() && "invalid operand");
+      assert(opOperand->getOperandNumber() >= getNumControlOperands() &&
+             "invalid operand");
       return getOperation()->getOpResult(
-          opOperand->getOperandNumber() - getRank());
+          opOperand->getOperandNumber() - getNumControlOperands());
     }
 
     /// Return the num_threads operand that is tied to the given thread id
     /// block argument.
     OpOperand *getTiedOpOperand(BlockArgument bbArg) {
       assert(bbArg.getArgNumber() >= getRank() && "invalid bbArg");
-      return &getOperation()->getOpOperand(bbArg.getArgNumber());
+      return &getOperation()->getOpOperand(bbArg.getArgNumber() + 2 * getRank());
     }
 
     /// Return the shared_outs operand that is tied to the given OpResult.
     OpOperand *getTiedOpOperand(OpResult opResult) {
       assert(opResult.getDefiningOp() == getOperation() && "invalid OpResult");
       return &getOperation()->getOpOperand(
-          opResult.getResultNumber() + getRank());
+          opResult.getResultNumber() + getNumControlOperands());
     }
 
     BlockArgument getTiedBlockArgument(OpOperand *opOperand) {
-      assert(opOperand->getOperandNumber() >= getRank() && "invalid operand");
-      return getBody()->getArgument(opOperand->getOperandNumber());
+      assert(opOperand->getOperandNumber() >= getNumControlOperands()
+             && "invalid operand");
+      return getBody()->getArgument(
+          opOperand->getOperandNumber() - 2 * getRank());
     }
 
     ArrayRef<BlockArgument> getOutputBlockArguments() {
       return getBody()->getArguments().drop_front(getRank());
     }
 
-    ::mlir::ValueRange getThreadIndices() {
+    ::mlir::ValueRange getInductionVars() {
       return getBody()->getArguments().take_front(getRank());
     }
 
-    ::mlir::Value getThreadIndex(int64_t idx) {
-      return getThreadIndices()[idx];
+    ::mlir::Value getInductionVar(int64_t idx) {
+      return getInductionVars()[idx];
     }
 
     ::mlir::Block::BlockArgListType getRegionOutArgs() {
       return getBody()->getArguments().drop_front(getRank());
     }
 
+    /// Checks if the lbs are zeros and steps are ones.
+    bool isNormalized();
+
     /// Helper to sort `values` according to matching `keys`.
/// Take a custom `compare` binary comparator which returns true if the first /// element is smaller than the second (i.e. compatible with std::sort). diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp --- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -180,15 +180,24 @@ // transform op does not apply to individual ForeachThreadOp. Location loc = foreachThreadOp->getLoc(); + if (!foreachThreadOp.isNormalized()) + return transformOp.emitSilenceableError() + << "unsupported non-normalized loops"; if (foreachThreadOp.getNumResults() > 0) return transformOp.emitSilenceableError() << "only bufferized scf.foreach_thread lowers to " "gpu.block_id"; - if (foreachThreadOp.getNumThreads().size() > 3) + if (foreachThreadOp.getRank() > 3) return transformOp.emitSilenceableError() << "scf.foreach_thread with rank > 3 does not lower to " "gpu.block_id"; - if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) { + if (llvm::any_of(foreachThreadOp.getLowerBound(), [](Value v) { + return !v.getDefiningOp(); + })) { + return transformOp.emitSilenceableError() + << "unsupported dynamic griddim size"; + } + if (llvm::any_of(foreachThreadOp.getUpperBound(), [](Value v) { return !v.getDefiningOp(); })) { return transformOp.emitSilenceableError() @@ -199,7 +208,7 @@ // Step 1. Complete the blockMapping to a full mapping (with 1s) if necessary. SmallVector numBlocks = - llvm::to_vector(foreachThreadOp.getNumThreads()); + llvm::to_vector(foreachThreadOp.getUpperBound()); // Ensure we have 3 block sizes, one for each id. Value one; for (auto attr : mappingAttributes) { @@ -227,7 +236,7 @@ blockIdGenerator(rewriter, foreachThreadOp, blockOps); IRMapping bvm; for (auto [blockIdx, blockDim] : - llvm::zip(foreachThreadOp.getThreadIndices(), blockMapping)) { + llvm::zip(foreachThreadOp.getInductionVars(), blockMapping)) { bvm.map(blockIdx, blockOps[static_cast( blockDim.cast().getMappingId())]); @@ -243,7 +252,7 @@ sourceBlock.getOperations()); // Step 5. RAUW thread indices to thread ops. - for (Value loopIndex : foreachThreadOp.getThreadIndices()) { + for (Value loopIndex : foreachThreadOp.getInductionVars()) { Value blockIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, blockIdx); } @@ -381,13 +390,15 @@ return emitDefiniteFailure(foreachThreadOp, message); }; Location loc = foreachThreadOp->getLoc(); + if (!foreachThreadOp.isNormalized()) + return failureHelper("unsupported non-normalized loops"); if (foreachThreadOp.getNumResults() > 0) return failureHelper( "only bufferized scf.foreach_thread lowers to gpu.thread_id"); - if (foreachThreadOp.getNumThreads().size() > 3) + if (foreachThreadOp.getRank() > 3) return failureHelper( "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id"); - if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) { + if (llvm::any_of(foreachThreadOp.getUpperBound(), [](Value v) { return !v.getDefiningOp(); })) { return failureHelper("unsupported dynamic blockdim size"); @@ -400,7 +411,7 @@ // Step 1. Complete the threadMapping to a full mapping (with 1s) if // necessary. SmallVector numThreads = - llvm::to_vector(foreachThreadOp.getNumThreads()); + llvm::to_vector(foreachThreadOp.getUpperBound()); // Ensure we have 3 block sizes, one for each id. 
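[Reviewer note] To make the new contract of the GPU mapping above concrete, here is a minimal sketch (names and constants are illustrative, patterned on the updated tests later in this patch) of the only loop shape it still accepts: constant-zero lower bounds, constant-one steps, and constant upper bounds that take over the role of the old `num_threads` operands. Anything else now hits the "unsupported non-normalized loops" or dynamic griddim/blockdim size errors.

```mlir
// A normalized scf.foreach_thread: zero lower bounds, unit steps, constant
// upper bounds. The trip counts (%c7, %c9) are what the transform now reads
// via getUpperBound() when sizing gpu.block_id / gpu.thread_id.
func.func @normalized_saxpy_like(%x: memref<7x9xf32>, %y: memref<7x9xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c7 = arith.constant 7 : index
  %c9 = arith.constant 9 : index
  scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) {
    %v = memref.load %x[%i, %j] : memref<7x9xf32>
    memref.store %v, %y[%i, %j] : memref<7x9xf32>
  } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
  return
}
```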
Value one; for (auto attr : threadMappingAttributes) { @@ -437,7 +448,7 @@ } IRMapping bvm; for (auto [blockIdx, blockDim] : - llvm::zip(foreachThreadOp.getThreadIndices(), threadMapping)) { + llvm::zip(foreachThreadOp.getInductionVars(), threadMapping)) { bvm.map(blockIdx, threadOpsUpdated[blockDim.cast() .getMappingId()]); @@ -484,7 +495,7 @@ sourceBlock.getOperations()); // Step 6. RAUW thread indices to thread ops. - for (Value loopIndex : foreachThreadOp.getThreadIndices()) { + for (Value loopIndex : foreachThreadOp.getInductionVars()) { Value threadIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, threadIdx); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -253,7 +253,7 @@ OpBuilder::InsertionGuard g(b); b.setInsertionPointToStart(foreachThreadOp.getBody(0)); - ValueRange threadIds = foreachThreadOp.getThreadIndices(); + ValueRange threadIds = foreachThreadOp.getInductionVars(); SmallVector nonZeroNumThreads = llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) { return !isConstantIntValue(ofr, 0); @@ -359,8 +359,12 @@ // 1. Create the ForeachThreadOp. We don't use the lambda body-builder // version because we require the use of RewriterBase in the body, so we // manually move the insertion point to the body below. + unsigned numLoops = materializedNonZeroNumThreads.size(); + SmallVector lbs(numLoops, b.create(loc, 0)); + SmallVector steps(numLoops, b.create(loc, 1)); scf::ForeachThreadOp foreachThreadOp = b.create( - loc, dest, ValueRange(materializedNonZeroNumThreads), mapping); + loc, lbs, ValueRange(materializedNonZeroNumThreads), steps, dest, + mapping); // 2. Fill out the ForeachThreadOp body. SmallVector tiledOffsets, tiledSizes; @@ -680,9 +684,12 @@ getAsValues(b, loc, nonZeroNumThreads); // 2. Create the ForeachThreadOp with an empty region. + unsigned numLoops = materializedNonZeroNumThreads.size(); + SmallVector lbs(numLoops, b.create(loc, 0)); + SmallVector steps(numLoops, b.create(loc, 1)); scf::ForeachThreadOp foreachThreadOp = b.create( - loc, (*identityTensor)->getResults(), - ValueRange(materializedNonZeroNumThreads), mapping); + loc, lbs, ValueRange(materializedNonZeroNumThreads), steps, + (*identityTensor)->getResults(), mapping); // 3. Calculate the tile offsets and sizes for the subsequent loop that will // be nested under `foreachThreadOp`. @@ -712,7 +719,7 @@ b.getIndexAttr(0)); SmallVector sizes = tiledSizes; sizes[reductionDim] = b.getIndexAttr(1); - outOffsets[reductionDim] = foreachThreadOp.getThreadIndices().front(); + outOffsets[reductionDim] = foreachThreadOp.getInductionVars().front(); // TODO: use SubsetExtractOpInterface once it is available. 
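[Reviewer note] The tiling entry points in Tiling.cpp above now materialize explicit constant-0 lower bounds and constant-1 steps before creating the op, so the loops they emit stay normalized. A sketch of the resulting IR shape, modeled on the `parallel_insert_slice` tests elsewhere in this patch (tensor sizes and names are arbitrary):

```mlir
// Shape of the loops the tiling transforms now emit: still one "thread" per
// tile, but the former num_threads value (%c10) is spelled as the upper
// bound of a normalized (lb, ub, step) triple.
func.func @tiled_shape(%slice: tensor<1x5xf32>, %init: tensor<10x5xf32>) -> tensor<10x5xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %res = scf.foreach_thread (%iv) = (%c0) to (%c10) step (%c1)
      shared_outs(%out = %init) -> (tensor<10x5xf32>) {
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %slice into %out[%iv, 0] [1, 5] [1, 1]
        : tensor<1x5xf32> into tensor<10x5xf32>
    }
  }
  return %res : tensor<10x5xf32>
}
```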
tiledDpsInitOperands.push_back(b.create( loc, initOperand->get().getType().cast(), @@ -746,7 +753,7 @@ if (failed(maybeTiled)) return b.notifyMatchFailure(op, "failed tileLinalgOpImpl"); - SmallVector ids = foreachThreadOp.getThreadIndices(); + SmallVector ids = foreachThreadOp.getInductionVars(); mapLoopToProcessorIds(cast(maybeTiled->loops.back()), ids, materializedNonZeroNumThreads); assert(maybeTiled->loops.size() == 1 && @@ -774,7 +781,7 @@ int64_t sizeIdx = 0; for (int64_t i = 0, e = numThreads.size(); i < e; ++i) { if (i == reductionDim) { - resultOffsetsRank.push_back(foreachThreadOp.getThreadIndices().front()); + resultOffsetsRank.push_back(foreachThreadOp.getInductionVars().front()); resultSizesRank.push_back(b.getIndexAttr(1)); continue; } diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -1142,11 +1142,9 @@ } void ForeachThreadOp::print(OpAsmPrinter &p) { - p << " ("; - llvm::interleaveComma(getThreadIndices(), p); - p << ") in ("; - llvm::interleaveComma(getNumThreads(), p); - p << ")"; + p << " (" << getInductionVars() << ") = (" << getLowerBound() << ") to (" + << getUpperBound() << ") step (" << getStep() << ") "; + printInitializationList(p, getRegionOutArgs(), getOutputs(), " shared_outs"); p << " "; if (!getRegionOutArgs().empty()) @@ -1164,17 +1162,30 @@ // Parse an opening `(` followed by thread index variables followed by `)` // TODO: when we can refer to such "induction variable"-like handles from the // declarative assembly format, we can implement the parser as a custom hook. - SmallVector threadIndices; - if (parser.parseArgumentList(threadIndices, OpAsmParser::Delimiter::Paren)) + SmallVector ivs; + if (parser.parseArgumentList(ivs, OpAsmParser::Delimiter::Paren)) + return failure(); + + // Parse lower bounds. + SmallVector lbs; + if (parser.parseEqual() || + parser.parseOperandList(lbs, ivs.size(), OpAsmParser::Delimiter::Paren) || + parser.resolveOperands(lbs, builder.getIndexType(), result.operands)) return failure(); - // Parse `in` threadNums. - SmallVector threadNums; - if (parser.parseKeyword("in") || - parser.parseOperandList(threadNums, threadIndices.size(), + // Parse upper bounds. + SmallVector ubs; + if (parser.parseKeyword("to") || + parser.parseOperandList(ubs, ivs.size(), OpAsmParser::Delimiter::Paren) || + parser.resolveOperands(ubs, builder.getIndexType(), result.operands)) + return failure(); + + // Parse step values. + SmallVector steps; + if (parser.parseKeyword("step") || + parser.parseOperandList(steps, ivs.size(), OpAsmParser::Delimiter::Paren) || - parser.resolveOperands(threadNums, builder.getIndexType(), - result.operands)) + parser.resolveOperands(steps, builder.getIndexType(), result.operands)) return failure(); // Parse out operands and results. @@ -1195,9 +1206,9 @@ // Parse region. SmallVector regionArgs; std::unique_ptr region = std::make_unique(); - for (auto &idx : threadIndices) { - idx.type = builder.getIndexType(); - regionArgs.push_back(idx); + for (auto &iv : ivs) { + iv.type = builder.getIndexType(); + regionArgs.push_back(iv); } for (const auto &it : llvm::enumerate(regionOutArgs)) { auto &out = it.value(); @@ -1215,19 +1226,24 @@ // Parse the optional attribute list. 
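[Reviewer note] The printer/parser changes above round-trip a full `= (lbs) to (ubs) step (steps)` clause, so non-normalized bounds become representable even though some consumers (e.g. the GPU mapping, via the `isNormalized()` helper added below) reject them. A hypothetical example of such a loop:

```mlir
// Accepted by the new parser, but not normalized: lower bound 2 and step 3.
// isNormalized() (defined further down) returns false here, and the GPU
// mapping above would report "unsupported non-normalized loops".
func.func @non_normalized(%buf: memref<100xf32>) {
  %cst = arith.constant 0.0 : f32
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %c20 = arith.constant 20 : index
  scf.foreach_thread (%i) = (%c2) to (%c20) step (%c3) {
    memref.store %cst, %buf[%i] : memref<100xf32>
  }
  return
}
```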
if (parser.parseOptionalAttrDict(result.attributes)) return failure(); - result.addAttribute("operand_segment_sizes", - parser.getBuilder().getDenseI32ArrayAttr( - {static_cast(threadNums.size()), - static_cast(outOperands.size())})); + result.addAttribute( + "operand_segment_sizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(lbs.size()), static_cast(ubs.size()), + static_cast(steps.size()), + static_cast(outOperands.size())})); return success(); } // Bodyless builder, outputs must be specified. void ForeachThreadOp::build(mlir::OpBuilder &builder, - mlir::OperationState &result, ValueRange outputs, - ValueRange numThreads, + mlir::OperationState &result, ValueRange lbs, + ValueRange ubs, ValueRange steps, + ValueRange outputs, std::optional mapping) { - result.addOperands(numThreads); + result.addOperands(lbs); + result.addOperands(ubs); + result.addOperands(steps); result.addOperands(outputs); if (mapping.has_value()) { result.addAttribute(ForeachThreadOp::getMappingAttrName(result.name), @@ -1236,7 +1252,9 @@ result.addAttribute( "operand_segment_sizes", - builder.getDenseI32ArrayAttr({static_cast(numThreads.size()), + builder.getDenseI32ArrayAttr({static_cast(lbs.size()), + static_cast(ubs.size()), + static_cast(steps.size()), static_cast(outputs.size())})); result.addTypes(TypeRange(outputs)); @@ -1248,9 +1266,8 @@ builder.createBlock(bodyRegion); Block &bodyBlock = bodyRegion->front(); // Add block arguments for indices and outputs. - bodyBlock.addArguments( - SmallVector(numThreads.size(), builder.getIndexType()), - SmallVector(numThreads.size(), result.location)); + bodyBlock.addArguments(SmallVector(lbs.size(), builder.getIndexType()), + SmallVector(lbs.size(), result.location)); bodyBlock.addArguments( TypeRange(outputs), SmallVector(outputs.size(), result.location)); @@ -1259,16 +1276,21 @@ // Builder that takes a bodyBuilder lambda. void ForeachThreadOp::build( - mlir::OpBuilder &builder, mlir::OperationState &result, ValueRange outputs, - ValueRange numThreads, ArrayRef mapping, + mlir::OpBuilder &builder, mlir::OperationState &result, ValueRange lbs, + ValueRange ubs, ValueRange steps, ValueRange outputs, + ArrayRef mapping, function_ref bodyBuilder) { - result.addOperands(numThreads); + result.addOperands(lbs); + result.addOperands(ubs); + result.addOperands(steps); result.addOperands(outputs); result.addAttribute(ForeachThreadOp::getMappingAttrName(result.name), builder.getArrayAttr(mapping)); result.addAttribute( "operand_segment_sizes", - builder.getDenseI32ArrayAttr({static_cast(numThreads.size()), + builder.getDenseI32ArrayAttr({static_cast(lbs.size()), + static_cast(ubs.size()), + static_cast(steps.size()), static_cast(outputs.size())})); result.addTypes(TypeRange(outputs)); @@ -1277,9 +1299,8 @@ builder.createBlock(bodyRegion); Block &bodyBlock = bodyRegion->front(); // Add block arguments for indices and outputs. - bodyBlock.addArguments( - SmallVector(numThreads.size(), builder.getIndexType()), - SmallVector(numThreads.size(), result.location)); + bodyBlock.addArguments(SmallVector(lbs.size(), builder.getIndexType()), + SmallVector(lbs.size(), result.location)); bodyBlock.addArguments( TypeRange(outputs), SmallVector(outputs.size(), result.location)); @@ -1294,6 +1315,15 @@ #endif // NDEBUG } +// Checks if the lbs are zeros and steps are ones. 
+bool ForeachThreadOp::isNormalized() { + return llvm::all_of( + getLowerBound(), + [](Value v) { return matchPattern(v, mlir::m_Zero()); }) && + llvm::all_of(getStep(), + [](Value v) { return matchPattern(v, m_One()); }); +} + // The ensureTerminator method generated by SingleBlockImplicitTerminator is // unaware of the fact that our terminator also needs a region to be // well-formed. We override it here to ensure that we do the right thing. diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1004,15 +1004,15 @@ /// Return `true` if the given loop may have 0 iterations. bool mayHaveZeroIterations(scf::ForeachThreadOp foreachThreadOp) { - int64_t p = 1; - for (Value v : foreachThreadOp.getNumThreads()) { - if (std::optional c = getConstantIntValue(v)) { - p *= *c; - } else { + for (auto [lb, ub] : llvm::zip(foreachThreadOp.getLowerBound(), + foreachThreadOp.getUpperBound())) { + std::optional lbConst = getConstantIntValue(lb); + std::optional ubConst = getConstantIntValue(ub); + if (!lbConst.has_value() || !ubConst.has_value() || *lbConst >= *ubConst) { return true; } } - return p == 0; + return false; } /// Bufferization of ForeachThreadOp. This also bufferizes the terminator of the @@ -1087,8 +1087,9 @@ rewriter.setInsertionPoint(foreachThreadOp); ForeachThreadOp newForeachThreadOp; newForeachThreadOp = rewriter.create( - foreachThreadOp.getLoc(), /*outputs=*/ValueRange(), - foreachThreadOp.getNumThreads(), foreachThreadOp.getMapping()); + foreachThreadOp.getLoc(), foreachThreadOp.getLowerBound(), + foreachThreadOp.getUpperBound(), foreachThreadOp.getStep(), + /*outputs=*/ValueRange(), foreachThreadOp.getMapping()); newForeachThreadOp.getBody()->getTerminator()->erase(); @@ -1127,10 +1128,27 @@ bool isRepetitiveRegion(Operation *op, unsigned index) const { auto foreachThreadOp = cast(op); - // This op is not repetitive if it has just a single thread. - return !llvm::all_of(foreachThreadOp.getNumThreads(), [](Value v) { - return getConstantIntValue(v) == static_cast(1); - }); + + // This op is not repetitive if it has just a single step. 
+ for (auto [lb, ub, step] : llvm::zip(foreachThreadOp.getLowerBound(), + foreachThreadOp.getUpperBound(), + foreachThreadOp.getStep())) { + std::optional lbConstant = getConstantIntValue(lb); + if (!lbConstant) + return true; + + std::optional ubConstant = getConstantIntValue(ub); + if (!ubConstant) + return true; + + std::optional stepConstant = getConstantIntValue(step); + if (!stepConstant) + return true; + + if (*lbConstant + *stepConstant < *ubConstant) + return true; + } + return false; } }; diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp --- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp @@ -180,10 +180,10 @@ if (scf::ForeachThreadOp foreachThreadOp = scf::getForeachThreadOpThreadIndexOwner(iv)) { for (int64_t idx = 0; idx < foreachThreadOp.getRank(); ++idx) { - if (foreachThreadOp.getThreadIndices()[idx] == iv) { - lb = OpBuilder(iv.getContext()).getIndexAttr(0); - ub = foreachThreadOp.getNumThreads()[idx]; - step = OpBuilder(iv.getContext()).getIndexAttr(1); + if (foreachThreadOp.getInductionVar(idx) == iv) { + lb = foreachThreadOp.getLowerBound()[idx]; + ub = foreachThreadOp.getUpperBound()[idx]; + step = foreachThreadOp.getStep()[idx]; return success(); } } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir @@ -144,14 +144,14 @@ // CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref // CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index func.func @parallel_insert_slice( - %t: tensor {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true}, - %sz: index) - -> (tensor) -{ + %t: tensor {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true}, + %sz: index) -> (tensor) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %f0 = arith.constant 0.0: f32 %c512 = arith.constant 512 : index - %r1 = scf.foreach_thread (%iv) in (%c512) shared_outs(%o = %t) -> (tensor) { + %r1 = scf.foreach_thread (%iv) = (%c0) to (%c512) step (%c1) shared_outs(%o = %t) -> (tensor) { // tensor.empty itself does not alloc but forwards to the insert_slice. // EmptyTensorOpElimination replaces the tensor.empty with an inplace // extract_slice. 
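[Reviewer note] The bufferization interface changes above now answer `mayHaveZeroIterations` and `isRepetitiveRegion` from the (lb, ub, step) triples instead of from `num_threads`. A small sketch of the two situations they distinguish (constants chosen purely for illustration):

```mlir
func.func @repetitiveness_examples(%ub: index, %buf: memref<8xf32>) {
  %cst = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Dynamic upper bound: %ub may be <= 0, so mayHaveZeroIterations()
  // conservatively answers true for this loop.
  scf.foreach_thread (%i) = (%c0) to (%ub) step (%c1) {
    memref.store %cst, %buf[%i] : memref<8xf32>
  }
  // Constant single-trip loop (lb + step >= ub): isRepetitiveRegion() now
  // reports it as non-repetitive, matching the old "all num_threads are 1"
  // rule.
  scf.foreach_thread (%j) = (%c0) to (%c1) step (%c1) {
    memref.store %cst, %buf[%j] : memref<8xf32>
  }
  return
}
```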
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -14,14 +14,15 @@ // ----- func.func @map_nested_foreach_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -30,10 +31,10 @@ gpu.terminator } - %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -55,14 +56,15 @@ // ----- func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -71,10 +73,10 @@ gpu.terminator } - %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, 
%c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -96,12 +98,13 @@ // ----- func.func @map_nested_foreach_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -122,9 +125,9 @@ // ----- func.func @map_nested_foreach_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) { - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %c1 = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32> gpu.terminator @@ -159,22 +162,23 @@ // ----- func.func @map_foreach_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // expected-note @below {{when applied to this payload op}} - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : memref<2 x 32 x f32> } { mapping = [#gpu.thread, #gpu.thread] } - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -197,19 +201,20 @@ // expected-note @below {{when applied to this payload op}} func.func @map_foreach_to_blocks_large_loop(%x: 
memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c65537 = arith.constant 65536 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - scf.foreach_thread (%i, %j) in (%c7, %c65537) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c65537) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : memref<2 x 32 x f32> } { mapping = [#gpu.thread, #gpu.thread] } - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -229,9 +234,10 @@ // ----- func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c65535 = arith.constant 65535 : index - scf.foreach_thread (%i, %j) in (%c65535, %c65535) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c65535, %c65535) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -251,12 +257,13 @@ !type = memref<32x32xf32> func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c32, %c32) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = arith.mulf %4, %5 : f32 @@ -277,9 +284,9 @@ // ----- func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) { - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %c1 = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { // expected-error @below {{'linalg.generic' op must have "tensor semantic" for tiling}} // expected-note @below {{when applied to this op}} diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -10,6 +10,7 @@ func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { %c9 = arith.constant 9 : index %c7 = 
arith.constant 7 : index + %zero = arith.constant 0 : index %one = arith.constant 1 : index // CHECK: gpu.launch // CHECK: %[[BLKX:.*]] = gpu.block_id x @@ -19,7 +20,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c7, %c9) step (%one, %one) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 @@ -46,6 +47,7 @@ // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> // CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %zero = arith.constant 0 : index %one = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index @@ -68,13 +70,13 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c7, %c9) step (%one, %one) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (%c12) { + scf.foreach_thread (%i) = (%zero) to (%c12) step (%one) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d @@ -98,22 +100,24 @@ // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32> // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32> func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index %c32 = arith.constant 32 : index %c64 = arith.constant 64 : index - %c4 = arith.constant 4 : index -// CHECK: %[[C32:.*]] = arith.constant 32 : index -// CHECK: %[[C64:.*]] = arith.constant 64 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]]) -// CHECK: %[[BLKX:.*]] = gpu.block_id x -// CHECK: %[[BLKY:.*]] = gpu.block_id y -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y -// CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] -// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] - scf.foreach_thread (%i, %j) in (%c32, %c64) { - scf.foreach_thread (%k, %l) in (%c4, %c32) { +// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[C64:.*]] = arith.constant 64 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]]) +// CHECK: %[[BLKX:.*]] = gpu.block_id x +// CHECK: %[[BLKY:.*]] = gpu.block_id y +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: memref.load 
%[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] +// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c32, %c64) step (%c1, %c1) { + scf.foreach_thread (%k, %l) = (%c0, %c0) to (%c4, %c32) step (%c1, %c1) { %4 = memref.load %x[%i, %j, %k, %l] : !type4d %5 = memref.load %y[%i, %j, %k, %l] : !type4d %6 = math.fma %alpha, %4, %5 : f32 @@ -137,6 +141,7 @@ // CHECK-LABEL: func.func @saxpy2d_no_barrier( func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %zero = arith.constant 0 : index %one = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index @@ -146,7 +151,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c7, %c9) step (%one, %one) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 @@ -170,15 +175,16 @@ // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x32xf32> // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x32xf32> func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { // CHECK: %[[TIDX:.*]] = gpu.thread_id x // CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]] // CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]] - scf.foreach_thread (%i) in (%c32) { + scf.foreach_thread (%i) = (%c0) to (%c32) step (%c1) { %4 = memref.load %x[%i, %i] : !type %5 = memref.load %y[%i, %i] : !type %6 = arith.mulf %4, %5 : f32 @@ -202,16 +208,17 @@ // CHECK-LABEL: func.func @saxpy3d_fold_id_z( func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK-NOT: gpu.thread_id z - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j, %k) in (%one, %c7, %c9) { + scf.foreach_thread (%i, %j, %k) = (%c0, %c0, %c0) to (%c1, %c7, %c9) step (%c1, %c1, %c1) { // CHECK: memref.load %{{.*}}[%[[C0]], // CHECK: memref.load %{{.*}}[%[[C0]], %4 = memref.load %x[%i, %j, %k] : !type @@ -238,23 +245,24 @@ // CHECK-LABEL: func.func @map_multi_level( func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { - %one = 
arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // check that the thread level got distributed but not the warp level. // CHECK-NOT: {mapping = #gpu.thread // CHECK: {mapping = [#gpu.warp]} - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (%c12) { + scf.foreach_thread (%i) = (%c0) to (%c12) step (%c1) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -876,11 +876,13 @@ // ----- func.func @reduce_dispatch_0() -> tensor<4x2xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c4 = arith.constant 4 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<4x2xf32> - %res = scf.foreach_thread (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) { + %res = scf.foreach_thread (%arg0, %arg1) = (%c0, %c0) to (%c4, %c2) step (%c1, %c1) shared_outs(%o = %0) -> (tensor<4x2xf32>) { %1 = tensor.empty() : tensor<1x1xf32> %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32> scf.foreach_thread.perform_concurrently { diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir --- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir @@ -15,7 +15,9 @@ func.func @matmul(%A: tensor, %B: tensor, %C: tensor) -> tensor { // CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index - // CHECK: scf.foreach_thread ({{.*}}) in (%[[C10]], %[[C20]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor) { + // CHECK: scf.foreach_thread + // CHECK-SAME: to (%[[C10]], %[[C20]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor) { // CHECK: %[[tA:.*]] = tensor.extract_slice %[[A]]{{.*}} : tensor to tensor // CHECK: %[[tB:.*]] = tensor.extract_slice %[[B]]{{.*}} : tensor to tensor // CHECK: %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor to tensor @@ -60,7 +62,9 @@ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size_1]]] // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map0]]()[%[[N]], %[[tile_size_2]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[NT0]], %[[NT1]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: 
tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -97,7 +101,9 @@ func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK-DAG: %[[c10:.+]] = arith.constant 10 : index // CHECK-DAG: %[[c21:.+]] = arith.constant 21 : index - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[c10]], %[[c21]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[c10]], %[[c21]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) // CHECK-NOT: affine.min @@ -140,7 +146,9 @@ // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]] // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[NT0]], %[[NT1]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) @@ -177,7 +185,9 @@ func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK-DAG: %[[c10:.+]] = arith.constant 10 : // CHECK-DAG: %[[c15:.+]] = arith.constant 15 : - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[c10]], %[[c15]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[c10]], %[[c15]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK-NOT: affine.max // CHECK-NOT: affine.min @@ -226,7 +236,9 @@ // CHECK-LABEL: extract_source( // CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: scf.foreach_thread (%[[ARG:.*]]) in (%[[C2]]) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) { +// CHECK: scf.foreach_thread (%[[ARG:.*]]) = +// CHECK-SAME: to (%[[C2]]) +// CHECK-SAME: shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) { // CHECK: %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]]) // CHECK: scf.foreach_thread.perform_concurrently { // CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32> @@ -254,7 +266,9 @@ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size]]] // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[NT0]], %[[NT1]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -290,7 +304,9 @@ %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>) -> (tensor<100xf32>, tensor<100xf32>) { // CHECK-DAG: %[[c0:.+]] = arith.constant 7 : -// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (%[[c0]]) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) +// CHECK: scf.foreach_thread (%[[IV0:.+]]) = +// 
CHECK-SAME: to (%[[c0]]) +// CHECK-SAME: shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) // CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]]) // CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) // CHECK-NOT: affine.min @@ -346,7 +362,9 @@ %OUT1: tensor<300x100xf32>, %OUT2: tensor<300xf32>) -> (tensor<300x100xf32>, tensor<300xf32>) { // CHECK-DAG: %[[c0:.+]] = arith.constant 4 : -// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (%[[c0]]) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) +// CHECK: scf.foreach_thread (%[[IV0:.+]]) = +// CHECK-SAME: to (%[[c0]]) +// CHECK-SAME: shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) // CHECK: %[[LB:.+]] = affine.apply #[[$map0]](%[[IV0]]) // CHECK: %[[tIN1:.+]] = tensor.extract_slice %[[IN2]][0, %[[LB]]] [100, 75] // CHECK: %[[tIN2:.+]] = tensor.extract_slice %[[IN3]][%[[LB]]] [75] diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir --- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir @@ -12,12 +12,13 @@ func.func @fuse_tileable_op(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor %1 = affine.apply #map0()[%d0, %arg0] // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%1) step (%c1) shared_outs(%o = %arg2) -> (tensor) { %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%d0, %arg0] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -63,11 +64,13 @@ // CHECK-SAME: %[[IN:[0-9a-z]+]]: tensor<64xf32> // CHECK-SAME: %[[OUT:[0-9a-z]+]]: tensor<64xf32> func.func @fuse_untileable_op(%arg0: index, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<64xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = tensor.empty(%arg0) : tensor %1 = affine.apply #map0()[%arg0] // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<64xf32>) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%1) step (%c1) shared_outs(%o = %arg2) -> (tensor<64xf32>) { // CHECK: %[[INIT_TENSOR:.*]] = tensor.empty %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%arg0] @@ -108,13 +111,14 @@ func.func @fuse_tileable_op_rank_reducing(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor // CHECK: scf.foreach_thread {{.*}} -> (tensor) { - %2 = scf.foreach_thread (%arg3) in (%d0) shared_outs(%o = %0) -> (tensor) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%d0) step (%c1) shared_outs(%o = %0) -> (tensor) { %5 = tensor.extract_slice %o[%arg3] [1] [1] : tensor to tensor - + // CHECK: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor to tensor<1xf32> // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : tensor<1xf32>) -> tensor<1xf32> // CHECK: tensor.extract_slice %{{.*}}[0] [1] [1] : tensor<1xf32> to tensor @@ -154,12 +158,14 @@ func.func 
@fuse_tileable_op_through_bbarg(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor %1 = affine.apply #map0()[%d0, %arg0] - // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor) { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %0) -> (tensor) { + // CHECK: scf.foreach_thread + // CHECK-SAME: shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%1) step (%c1) shared_outs(%o = %0) -> (tensor) { %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%d0, %arg0] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -204,6 +210,7 @@ func.func @fuse_tileable_multi_output_op(%idx: index, %in: tensor, %out_1: tensor, %out_2: tensor, %out_3: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0:2 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], @@ -219,7 +226,7 @@ %1 = affine.apply #map0()[%d0, %idx] // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%i) in (%1) shared_outs(%o = %out_2) -> (tensor) { + %2 = scf.foreach_thread (%i) = (%c0) to (%1) step (%c1) shared_outs(%o = %out_2) -> (tensor) { %3 = affine.apply #map1(%i)[%idx] %4 = affine.min #map2(%i)[%d0, %idx] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir --- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir @@ -128,7 +128,9 @@ // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) = +//CHECK-SAME: to (%[[C5]]) +//CHECK-SAME: shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] // CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]]) // CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor @@ -183,7 +185,9 @@ // CHECK-DAG: %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) = +//CHECK-SAME: to (%[[C5]]) +//CHECK-SAME: shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] // CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]]) // CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor to tensor @@ -241,7 +245,9 @@ // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) 
-> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) = +//CHECK-SAME: to (%[[C5]]) +//CHECK-SAME: shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor // CHECK: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[LB:.+]] = affine.apply #[[MAP0]]()[%[[IV]]] diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -1484,9 +1484,10 @@ %arg0 : tensor<1x5xf32>, %arg1: tensor, %num_threads : index) -> index { // CHECK: %[[c1:.*]] = arith.constant 1 : index + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) { + %2 = scf.foreach_thread (%tidx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %arg1) -> (tensor) { scf.foreach_thread.perform_concurrently { tensor.parallel_insert_slice %arg0 into %o[%tidx, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor } diff --git a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir --- a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir +++ b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir @@ -2,13 +2,15 @@ func.func @reduce() { // CHECK: %[[C64:.*]] = arith.constant 64 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %cst_0 = arith.constant -0.000000e+00 : f32 %0 = memref.alloc() : memref<128x384xf32> linalg.fill ins(%cst_0 : f32) outs(%0 : memref<128x384xf32>) %2 = memref.alloc() : memref<128xf32> linalg.fill ins(%cst_0 : f32) outs(%2 : memref<128xf32>) - scf.foreach_thread (%arg0) in (%c2) { + scf.foreach_thread (%arg0) = (%c0) to (%c2) step (%c1) { %7 = affine.min affine_map<(d0) -> (d0 * -64 + 128, 64)>(%arg0) %8 = affine.max affine_map<(d0) -> (0, d0)>(%7) %9 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -544,11 +544,12 @@ // ----- func.func @wrong_num_results(%in: tensor<100xf32>, %out: tensor<100xf32>) { + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index // expected-error @+1 {{1 operands present, but expected 2}} - %result:2 = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { + %result:2 = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> scf.foreach_thread.perform_concurrently { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : @@ -561,10 +562,11 @@ // ----- func.func @invalid_insert_dest(%in: tensor<100xf32>, %out: tensor<100xf32>) { + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) { + %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> (tensor<100xf32>) { %1 
= tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.foreach_thread.perform_concurrently {
      // expected-error @+1 {{may only insert into an output block argument}}
@@ -578,10 +580,11 @@
// -----
func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> (tensor<100xf32>) {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
    scf.foreach_thread.perform_concurrently {
@@ -596,10 +599,11 @@
// -----
func.func @mismatched_mapping(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %c65535 = arith.constant 65535 : index
  // expected-error @below {{'scf.foreach_thread' op mapping attribute size must match op rank}}
-  scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+  scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c65535, %c65535) step (%one, %one) {
    %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
    %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
    %6 = math.fma %alpha, %4, %5 : f32
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -612,9 +612,10 @@
    -> tensor<320xf32> {
  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %cst = arith.constant -0.000000e+00 : f32
  %c320 = arith.constant 320 : index
-  %4 = scf.foreach_thread (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
+  %4 = scf.foreach_thread (%arg0) = (%c0) to (%c320) step (%c1) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1] : tensor<320x10240xf32> to tensor<1x10240xf32>
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
@@ -108,18 +108,19 @@
// -----
// CHECK-LABEL: func @scf_foreach_thread_out_of_place(
//  CHECK-SAME:   %[[arg0:.*]]: tensor<100xf32>, %[[arg1:.*]]: tensor<100xf32>
// CHECK-FUNC-LABEL: func @scf_foreach_thread_out_of_place(
func.func @scf_foreach_thread_out_of_place(%in: tensor<100xf32>,
                                           %out: tensor<100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK-FUNC-NOT: alloc_tensor
  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() copy(%[[arg1]]) {bufferization.escape = [false]} : tensor<100xf32>
  // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[o:.*]] = %[[alloc]])
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
    // CHECK: tensor.extract_slice
    // CHECK: scf.foreach_thread.perform_concurrently
    // CHECK: tensor.parallel_insert_slice %{{.*}} into %[[o]]
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -543,8 +543,8 @@
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
-  // CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]])
-  %2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor) {
+  // CHECK: scf.foreach_thread (%[[tidx:.*]]) to (%[[idx2]])
+  %2 = scf.foreach_thread (%arg3) = (%c0) to (%idx2) step (%c1) shared_outs(%o = %arg2) -> (tensor) {
    // CHECK: %[[subview:.*]] = memref.subview %[[arg2]][5] [%[[idx]]] [1]
    %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor
    // CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview]] : memref (tensor) {
+  // CHECK: scf.foreach_thread (%[[tidx:.*]]) to (%[[idx2]])
+  %2 = scf.foreach_thread (%arg3) = (%c0) to (%idx2) step (%c1) shared_outs(%o = %arg2) -> (tensor) {
    // CHECK: %[[subview1:.*]] = memref.subview %[[alloc1]][5] [%[[idx]]] [1]
    %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor
@@ -625,12 +625,15 @@
#map1 = affine_map<(d0) -> (d0 * 2)>
// CHECK-LABEL: func.func @matmul
-func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<8x8xf32> {bufferization.writable = true}) -> tensor<8x8xf32> {
+func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>,
+                  %arg2: tensor<8x8xf32> {bufferization.writable = true}) -> tensor<8x8xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
-  // CHECK: scf.foreach_thread {{.*}}
-  %0 = scf.foreach_thread (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
+  // CHECK: scf.foreach_thread
+  %0 = scf.foreach_thread (%arg3, %arg4) = (%c0, %c0) to (%c2, %c4) step (%c1, %c1) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
    %1 = affine.apply #map0(%arg3)
    %3 = tensor.extract_slice %arg0[%1, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
    %4 = affine.apply #map1(%arg4)
@@ -651,6 +654,8 @@
// CHECK-LABEL: func @scf_foreach_private_var(
//  CHECK-SAME:   %[[t:.*]]: memref<10xf32
func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c5 = arith.constant 5 : index
@@ -658,13 +663,13 @@
  // CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32>
  // CHECK: memref.copy %[[t]], %[[t_copy]]
-  // CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) {
+  // CHECK: scf.foreach_thread
  // Load from the copy and store into the shared output.
  // CHECK: %[[subview:.*]] = memref.subview %[[t]]
  // CHECK: memref.load %[[t_copy]]
  // CHECK: memref.store %{{.*}}, %[[subview]]
-  %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
+  %0 = scf.foreach_thread (%tid) = (%c0) to (%c2) step (%c1) shared_outs(%o = %t) -> tensor<10xf32> {
    %offset = arith.muli %c5, %tid : index
    %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32>
@@ -686,13 +691,15 @@
//  CHECK-SAME:   %[[t0:.*]]: memref<10xf32, {{.*}}>, %[[t1:.*]]: memref<10xf32
func.func @scf_foreach_privatized_but_not_copied(
    %t0: tensor<10xf32>, %t1: tensor<10xf32>) -> f32 {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c5 = arith.constant 5 : index
  // CHECK-NOT: memref.alloc
  // CHECK-NOT: memref.copy
-  // CHECK: scf.foreach_thread {{.*}} {
-  %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> {
+  // CHECK: scf.foreach_thread
+  %0 = scf.foreach_thread (%tid) = (%c0) to (%c2) step (%c1) shared_outs(%o = %t0) -> tensor<10xf32> {
    %offset = arith.muli %c5, %tid : index
    %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32>
diff --git a/mlir/test/Dialect/SCF/ops.mlir b/mlir/test/Dialect/SCF/ops.mlir
--- a/mlir/test/Dialect/SCF/ops.mlir
+++ b/mlir/test/Dialect/SCF/ops.mlir
@@ -313,6 +313,7 @@
// CHECK-LABEL: func.func @simple_example
func.func @simple_example(%in: tensor<100xf32>, %out: tensor<100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
@@ -323,7 +324,7 @@
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
@@ -335,12 +336,14 @@
// CHECK-LABEL: func.func @elide_terminator
func.func @elide_terminator() -> () {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK: scf.foreach_thread
  // CHECK-NEXT: } {mapping = [#gpu.thread]}
  // CHECK-NEXT: return
-  scf.foreach_thread (%thread_idx) in (%num_threads) {
+  scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) {
    scf.foreach_thread.perform_concurrently {
    }
  } {mapping = [#gpu.thread]}
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1531,10 +1531,12 @@
  %c1 = arith.constant 1 : index
  // CHECK-NOT: tensor.cast
-  // CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor) {
+  // CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) = (%{{[0-9a-z]*}})
+  // CHECK-SAME: to (%[[num_threads]]) step (%{{[0-9a-z]*}})
+  // CHECK-SAME: shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor) {
  // CHECK-NEXT: scf.foreach_thread.perform_concurrently {
  // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][%[[tidx]], 0] [1, 5] [1, 1]
-  %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) {
+  %2 = scf.foreach_thread (%tidx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %arg1) -> (tensor) {
    %3 = tensor.cast %arg0 : tensor<1x5xf32> to tensor
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %3 into %o[%tidx, %c0] [%c1, 5] [%c1, %c1] : tensor into tensor
@@ -1553,10 +1555,10 @@
{
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
-  // CHECK: scf.foreach_thread () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
+  // CHECK: scf.foreach_thread () = () to () step () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
  // CHECK-NEXT: scf.foreach_thread.perform_concurrently {
  // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][0, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<1x5xf32>
-  %2 = scf.foreach_thread () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
+  %2 = scf.foreach_thread () = () to () step () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %arg0 into %o[%c0, %c0] [1, 5] [%c1, %c1] : tensor<1x5xf32> into tensor<1x5xf32>
    }
diff --git a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
--- a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
+++ b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
@@ -29,7 +29,9 @@
// FOREACH-DAG: %[[c5:.+]] = arith.constant 5 : index
// FOREACH-DAG: %[[c7:.+]] = arith.constant 7 : index
// FOREACH-DAG: %[[init:.+]] = tensor.empty() : tensor<20x11xf32>
-// FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) in (%[[c20]]) shared_outs(%[[dest:.+]] = %[[init]])
+// FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) =
+// FOREACH-SAME: to (%[[c20]])
+// FOREACH-SAME: shared_outs(%[[dest:.+]] = %[[init]])
// FOREACH: %[[multiIndex:.+]]:3 = affine.delinearize_index %[[iv]] into (%[[c3]], %[[c5]], %[[c7]]
// FOREACH: %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex]]#0, %[[multiIndex]]#1, %[[multiIndex]]#2, 0] [1, 1, 1, 11] [1, 1, 1, 1] :
// FOREACH: %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3]{{\]}} :
@@ -137,7 +139,9 @@
// FOREACH-DAG: %[[d1:.+]] = tensor.dim %[[arg0]], %[[c1]] :
// FOREACH-DAG: %[[d2:.+]] = tensor.dim %[[arg0]], %[[c2]] :
// FOREACH-DAG: %[[d4:.+]] = tensor.dim %[[arg0]], %[[c4]] :
-// FOREACH: %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]])
+// FOREACH: %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) =
+// FOREACH-SAME: to (%[[sz1]], %[[sz2]])
+// FOREACH-SAME: shared_outs(%[[dest:.+]] = %[[init]])
// FOREACH-DAG: %[[iv1:.+]] = affine.apply #[[map1]](%[[tid1]])[%[[lb1]]]
// FOREACH: %[[multiIndex1:.+]]:3 = affine.delinearize_index %[[iv1]] into (%[[c3]], %[[d1]], %[[d2]]) :
// FOREACH-DAG: %[[iv2:.+]] = affine.apply #[[map1]](%[[tid2]])[%[[lb2]]]
diff --git a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
--- a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
+++ b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
@@ -88,9 +88,10 @@
// CHECK-NOT: tensor.insert_slice
// CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor into tensor<1x2xf32>
func.func @parallel_insert_slice(%t0: tensor<1x2xf32>, %t1: tensor, %t2: tensor<1x1xf32>) -> tensor<1x2xf32> {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
-  %r = scf.foreach_thread (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
+  %r = scf.foreach_thread (%arg2, %arg3) = (%c0, %c0) to (%c1, %c2) step (%c1, %c1) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
    %inserted_slice = tensor.insert_slice %t1 into %t2[0, 0] [1, 1] [1, 1] : tensor into tensor<1x1xf32>
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %inserted_slice into %arg4[%arg2, %arg3] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x2xf32>
diff --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -42,9 +42,11 @@
func.func @rank_reducing_parallel_insert_of_collapse_shape(
    %t: tensor, %d: tensor, %sz: index, %thr: index) -> tensor {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %0 = tensor.collapse_shape %t [[0, 1], [2], [3]] : tensor into tensor
-  %1 = scf.foreach_thread (%iv) in (%thr) shared_outs(%o = %d) -> (tensor) {
+  %1 = scf.foreach_thread (%iv) = (%c0) to (%thr) step (%c1) shared_outs(%o = %d) -> (tensor) {
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %0 into %o[0, 0, 0, 0][%sz, 1, 1, 5][1, 1, 1, 1] : tensor into tensor
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -193,11 +193,12 @@
// CHECK-LABEL: func.func @rank_reducing_parallel_insert_slice
func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tensor<200x100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK: scf.foreach_thread {{.*}} {
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs (%o = %out) -> tensor<200x100xf32> {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.foreach_thread.perform_concurrently {
      // CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
@@ -335,7 +336,7 @@
// CHECK-LABEL: func @dim_not_reading(
//  CHECK-SAME:   %[[t:.*]]: memref, %f: f32, %pos: index)
+func.func @dim_not_reading(%t: tensor, %f: f32, %pos: index) -> (tensor, index) {
  %c0 = arith.constant 0 : index
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -247,8 +247,14 @@
                    tensor::ExtractSliceFromCollapseHelper &helper,
                    PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
-    auto foreachOp = rewriter.create<scf::ForeachThreadOp>(
-        loc, /*outputs=*/dest, /*numThreads=*/helper.getIterationSpaceSizes(),
+    unsigned numLoops = helper.getIterationSpaceSizes().size();
+    SmallVector<Value> lbs(numLoops,
+                           rewriter.create<arith::ConstantIndexOp>(loc, 0));
+    SmallVector<Value> steps(numLoops,
+                             rewriter.create<arith::ConstantIndexOp>(loc, 1));
+    auto foreachThreadOp = rewriter.create<scf::ForeachThreadOp>(
+        loc, lbs, /*numThreads=*/helper.getIterationSpaceSizes(), steps,
+        /*outputs=*/dest,
        /*mapping=*/ArrayRef<Attribute>{},
        [&](OpBuilder &nestedBuilder, Location loc, ValueRange regionArgs) {
          unsigned numThreadIdRegionArgs =
@@ -267,7 +273,7 @@
          nestedBuilder.create<tensor::ParallelInsertSliceOp>(
              loc, tile, outputArgs[0], insertParams);
        });
-    rewriter.replaceOp(op, foreachOp->getResult(0));
+    rewriter.replaceOp(op, foreachThreadOp->getResult(0));
    return success();
  }
};