diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -481,9 +481,12 @@
     }
     ```
   }];
-  let arguments = (ins Variadic<Index>:$num_threads,
-                       Variadic<AnyRankedTensor>:$outputs,
-                       OptionalAttr<DeviceMappingArrayAttr>:$mapping);
+  let arguments = (ins
+      Variadic<Index>:$lowerBound,
+      Variadic<Index>:$upperBound,
+      Variadic<Index>:$step,
+      Variadic<AnyRankedTensor>:$outputs,
+      OptionalAttr<DeviceMappingArrayAttr>:$mapping);
   let results = (outs Variadic<AnyRankedTensor>:$results);
   let regions = (region SizedRegion<1>:$region);
 
@@ -496,57 +499,67 @@
   let skipDefaultBuilders = 1;
   let builders = [
     // Bodyless builder, outputs must be specified.
-    OpBuilder<(ins "ValueRange":$outputs, "ValueRange":$num_threads,
-                   "std::optional<ArrayAttr>":$mapping)>,
+    OpBuilder<(ins "ValueRange":$lbs, "ValueRange":$ubs, "ValueRange":$steps,
+                   "ValueRange":$outputs, "std::optional<ArrayAttr>":$mapping)>,
+
     // Builder that takes a bodyBuilder lambda.
-    OpBuilder<(ins "ValueRange":$outputs, "ValueRange":$num_threads,
-                   "ArrayRef<Attribute>":$mapping,
+    OpBuilder<(ins "ValueRange":$lbs, "ValueRange":$ubs, "ValueRange":$steps,
+                   "ValueRange":$outputs, "ArrayRef<Attribute>":$mapping,
                    "function_ref<void(OpBuilder &, Location, ValueRange)>":$bodyBuilder)>
   ];
   let extraClassDeclaration = [{
-    int64_t getRank() { return getNumThreads().size(); }
+    int64_t getRank() { return getLowerBound().size(); }
+
+    /// Number of operands controlling the loop: lbs, ubs, steps
+    unsigned getNumControlOperands() { return 3 * getRank(); }
 
     OpResult getTiedOpResult(OpOperand *opOperand) {
-      assert(opOperand->getOperandNumber() >= getRank() && "invalid operand");
+      assert(opOperand->getOperandNumber() >= getNumControlOperands() &&
+             "invalid operand");
       return getOperation()->getOpResult(
-          opOperand->getOperandNumber() - getRank());
+          opOperand->getOperandNumber() - getNumControlOperands());
     }
 
     /// Return the num_threads operand that is tied to the given thread id
     /// block argument.
     OpOperand *getTiedOpOperand(BlockArgument bbArg) {
       assert(bbArg.getArgNumber() >= getRank() && "invalid bbArg");
-      return &getOperation()->getOpOperand(bbArg.getArgNumber());
+      return &getOperation()->getOpOperand(bbArg.getArgNumber() + 2 * getRank());
     }
 
     /// Return the shared_outs operand that is tied to the given OpResult.
     OpOperand *getTiedOpOperand(OpResult opResult) {
       assert(opResult.getDefiningOp() == getOperation() && "invalid OpResult");
       return &getOperation()->getOpOperand(
-          opResult.getResultNumber() + getRank());
+          opResult.getResultNumber() + getNumControlOperands());
     }
 
     BlockArgument getTiedBlockArgument(OpOperand *opOperand) {
-      assert(opOperand->getOperandNumber() >= getRank() && "invalid operand");
-      return getBody()->getArgument(opOperand->getOperandNumber());
+      assert(opOperand->getOperandNumber() >= getNumControlOperands()
+             && "invalid operand");
+      return getBody()->getArgument(
+          opOperand->getOperandNumber() - 2 * getRank());
     }
 
     ArrayRef<BlockArgument> getOutputBlockArguments() {
       return getBody()->getArguments().drop_front(getRank());
     }
 
-    ::mlir::ValueRange getThreadIndices() {
+    ::mlir::ValueRange getInductionVars() {
       return getBody()->getArguments().take_front(getRank());
     }
 
-    ::mlir::Value getThreadIndex(int64_t idx) {
-      return getThreadIndices()[idx];
+    ::mlir::Value getInductionVar(int64_t idx) {
+      return getInductionVars()[idx];
     }
 
     ::mlir::Block::BlockArgListType getRegionOutArgs() {
       return getBody()->getArguments().drop_front(getRank());
     }
 
+    /// Checks if the lbs are zeros and steps are ones.
+    bool isNormalized();
+
     /// Helper to sort `values` according to matching `keys`.
/// Take a custom `compare` binary comparator which returns true if the first /// element is smaller than the second (i.e. compatible with std::sort). diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp --- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -180,15 +180,24 @@ // transform op does not apply to individual ForeachThreadOp. Location loc = foreachThreadOp->getLoc(); + if (!foreachThreadOp.isNormalized()) + return transformOp.emitSilenceableError() + << "unsupported non-normalized loops"; if (foreachThreadOp.getNumResults() > 0) return transformOp.emitSilenceableError() << "only bufferized scf.foreach_thread lowers to " "gpu.block_id"; - if (foreachThreadOp.getNumThreads().size() > 3) + if (foreachThreadOp.getRank() > 3) return transformOp.emitSilenceableError() << "scf.foreach_thread with rank > 3 does not lower to " "gpu.block_id"; - if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) { + if (llvm::any_of(foreachThreadOp.getLowerBound(), [](Value v) { + return !v.getDefiningOp(); + })) { + return transformOp.emitSilenceableError() + << "unsupported dynamic griddim size"; + } + if (llvm::any_of(foreachThreadOp.getUpperBound(), [](Value v) { return !v.getDefiningOp(); })) { return transformOp.emitSilenceableError() @@ -199,7 +208,7 @@ // Step 1. Complete the blockMapping to a full mapping (with 1s) if necessary. SmallVector numBlocks = - llvm::to_vector(foreachThreadOp.getNumThreads()); + llvm::to_vector(foreachThreadOp.getUpperBound()); // Ensure we have 3 block sizes, one for each id. Value one; for (auto attr : mappingAttributes) { @@ -227,7 +236,7 @@ blockIdGenerator(rewriter, foreachThreadOp, blockOps); IRMapping bvm; for (auto [blockIdx, blockDim] : - llvm::zip(foreachThreadOp.getThreadIndices(), blockMapping)) { + llvm::zip(foreachThreadOp.getInductionVars(), blockMapping)) { bvm.map(blockIdx, blockOps[static_cast( blockDim.cast().getMappingId())]); @@ -243,7 +252,7 @@ sourceBlock.getOperations()); // Step 5. RAUW thread indices to thread ops. - for (Value loopIndex : foreachThreadOp.getThreadIndices()) { + for (Value loopIndex : foreachThreadOp.getInductionVars()) { Value blockIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, blockIdx); } @@ -381,13 +390,15 @@ return emitDefiniteFailure(foreachThreadOp, message); }; Location loc = foreachThreadOp->getLoc(); + if (!foreachThreadOp.isNormalized()) + return failureHelper("unsupported non-normalized loops"); if (foreachThreadOp.getNumResults() > 0) return failureHelper( "only bufferized scf.foreach_thread lowers to gpu.thread_id"); - if (foreachThreadOp.getNumThreads().size() > 3) + if (foreachThreadOp.getRank() > 3) return failureHelper( "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id"); - if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) { + if (llvm::any_of(foreachThreadOp.getUpperBound(), [](Value v) { return !v.getDefiningOp(); })) { return failureHelper("unsupported dynamic blockdim size"); @@ -400,7 +411,7 @@ // Step 1. Complete the threadMapping to a full mapping (with 1s) if // necessary. SmallVector numThreads = - llvm::to_vector(foreachThreadOp.getNumThreads()); + llvm::to_vector(foreachThreadOp.getUpperBound()); // Ensure we have 3 block sizes, one for each id. 
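[Reviewer note] To make the new contract of the GPU mapping above concrete, here is a minimal sketch (names and constants are illustrative, patterned on the updated tests later in this patch) of the only loop shape it still accepts: constant-zero lower bounds, constant-one steps, and constant upper bounds that take over the role of the old `num_threads` operands. Anything else now hits the "unsupported non-normalized loops" or dynamic griddim/blockdim size errors.

```mlir
// A normalized scf.foreach_thread: zero lower bounds, unit steps, constant
// upper bounds. The trip counts (%c7, %c9) are what the transform now reads
// via getUpperBound() when sizing gpu.block_id / gpu.thread_id.
func.func @normalized_saxpy_like(%x: memref<7x9xf32>, %y: memref<7x9xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c7 = arith.constant 7 : index
  %c9 = arith.constant 9 : index
  scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) {
    %v = memref.load %x[%i, %j] : memref<7x9xf32>
    memref.store %v, %y[%i, %j] : memref<7x9xf32>
  } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
  return
}
```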
Value one; for (auto attr : threadMappingAttributes) { @@ -437,7 +448,7 @@ } IRMapping bvm; for (auto [blockIdx, blockDim] : - llvm::zip(foreachThreadOp.getThreadIndices(), threadMapping)) { + llvm::zip(foreachThreadOp.getInductionVars(), threadMapping)) { bvm.map(blockIdx, threadOpsUpdated[blockDim.cast() .getMappingId()]); @@ -484,7 +495,7 @@ sourceBlock.getOperations()); // Step 6. RAUW thread indices to thread ops. - for (Value loopIndex : foreachThreadOp.getThreadIndices()) { + for (Value loopIndex : foreachThreadOp.getInductionVars()) { Value threadIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, threadIdx); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -253,7 +253,7 @@ OpBuilder::InsertionGuard g(b); b.setInsertionPointToStart(foreachThreadOp.getBody(0)); - ValueRange threadIds = foreachThreadOp.getThreadIndices(); + ValueRange threadIds = foreachThreadOp.getInductionVars(); SmallVector nonZeroNumThreads = llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) { return !isConstantIntValue(ofr, 0); @@ -359,8 +359,12 @@ // 1. Create the ForeachThreadOp. We don't use the lambda body-builder // version because we require the use of RewriterBase in the body, so we // manually move the insertion point to the body below. + unsigned numLoops = materializedNonZeroNumThreads.size(); + SmallVector lbs(numLoops, b.create(loc, 0)); + SmallVector steps(numLoops, b.create(loc, 1)); scf::ForeachThreadOp foreachThreadOp = b.create( - loc, dest, ValueRange(materializedNonZeroNumThreads), mapping); + loc, lbs, ValueRange(materializedNonZeroNumThreads), steps, dest, + mapping); // 2. Fill out the ForeachThreadOp body. SmallVector tiledOffsets, tiledSizes; @@ -680,9 +684,12 @@ getAsValues(b, loc, nonZeroNumThreads); // 2. Create the ForeachThreadOp with an empty region. + unsigned numLoops = materializedNonZeroNumThreads.size(); + SmallVector lbs(numLoops, b.create(loc, 0)); + SmallVector steps(numLoops, b.create(loc, 1)); scf::ForeachThreadOp foreachThreadOp = b.create( - loc, (*identityTensor)->getResults(), - ValueRange(materializedNonZeroNumThreads), mapping); + loc, lbs, ValueRange(materializedNonZeroNumThreads), steps, + (*identityTensor)->getResults(), mapping); // 3. Calculate the tile offsets and sizes for the subsequent loop that will // be nested under `foreachThreadOp`. @@ -712,7 +719,7 @@ b.getIndexAttr(0)); SmallVector sizes = tiledSizes; sizes[reductionDim] = b.getIndexAttr(1); - outOffsets[reductionDim] = foreachThreadOp.getThreadIndices().front(); + outOffsets[reductionDim] = foreachThreadOp.getInductionVars().front(); // TODO: use SubsetExtractOpInterface once it is available. 
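[Reviewer note] The tiling entry points in Tiling.cpp above now materialize explicit constant-0 lower bounds and constant-1 steps before creating the op, so the loops they emit stay normalized. A sketch of the resulting IR shape, modeled on the `parallel_insert_slice` tests elsewhere in this patch (tensor sizes and names are arbitrary):

```mlir
// Shape of the loops the tiling transforms now emit: still one "thread" per
// tile, but the former num_threads value (%c10) is spelled as the upper
// bound of a normalized (lb, ub, step) triple.
func.func @tiled_shape(%slice: tensor<1x5xf32>, %init: tensor<10x5xf32>) -> tensor<10x5xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %res = scf.foreach_thread (%iv) = (%c0) to (%c10) step (%c1)
      shared_outs(%out = %init) -> (tensor<10x5xf32>) {
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %slice into %out[%iv, 0] [1, 5] [1, 1]
        : tensor<1x5xf32> into tensor<10x5xf32>
    }
  }
  return %res : tensor<10x5xf32>
}
```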
tiledDpsInitOperands.push_back(b.create( loc, initOperand->get().getType().cast(), @@ -746,7 +753,7 @@ if (failed(maybeTiled)) return b.notifyMatchFailure(op, "failed tileLinalgOpImpl"); - SmallVector ids = foreachThreadOp.getThreadIndices(); + SmallVector ids = foreachThreadOp.getInductionVars(); mapLoopToProcessorIds(cast(maybeTiled->loops.back()), ids, materializedNonZeroNumThreads); assert(maybeTiled->loops.size() == 1 && @@ -774,7 +781,7 @@ int64_t sizeIdx = 0; for (int64_t i = 0, e = numThreads.size(); i < e; ++i) { if (i == reductionDim) { - resultOffsetsRank.push_back(foreachThreadOp.getThreadIndices().front()); + resultOffsetsRank.push_back(foreachThreadOp.getInductionVars().front()); resultSizesRank.push_back(b.getIndexAttr(1)); continue; } diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -1142,11 +1142,9 @@ } void ForeachThreadOp::print(OpAsmPrinter &p) { - p << " ("; - llvm::interleaveComma(getThreadIndices(), p); - p << ") in ("; - llvm::interleaveComma(getNumThreads(), p); - p << ")"; + p << " (" << getInductionVars() << ") = (" << getLowerBound() << ") to (" + << getUpperBound() << ") step (" << getStep() << ") "; + printInitializationList(p, getRegionOutArgs(), getOutputs(), " shared_outs"); p << " "; if (!getRegionOutArgs().empty()) @@ -1164,17 +1162,30 @@ // Parse an opening `(` followed by thread index variables followed by `)` // TODO: when we can refer to such "induction variable"-like handles from the // declarative assembly format, we can implement the parser as a custom hook. - SmallVector threadIndices; - if (parser.parseArgumentList(threadIndices, OpAsmParser::Delimiter::Paren)) + SmallVector ivs; + if (parser.parseArgumentList(ivs, OpAsmParser::Delimiter::Paren)) + return failure(); + + // Parse lower bounds. + SmallVector lbs; + if (parser.parseEqual() || + parser.parseOperandList(lbs, ivs.size(), OpAsmParser::Delimiter::Paren) || + parser.resolveOperands(lbs, builder.getIndexType(), result.operands)) return failure(); - // Parse `in` threadNums. - SmallVector threadNums; - if (parser.parseKeyword("in") || - parser.parseOperandList(threadNums, threadIndices.size(), + // Parse upper bounds. + SmallVector ubs; + if (parser.parseKeyword("to") || + parser.parseOperandList(ubs, ivs.size(), OpAsmParser::Delimiter::Paren) || + parser.resolveOperands(ubs, builder.getIndexType(), result.operands)) + return failure(); + + // Parse step values. + SmallVector steps; + if (parser.parseKeyword("step") || + parser.parseOperandList(steps, ivs.size(), OpAsmParser::Delimiter::Paren) || - parser.resolveOperands(threadNums, builder.getIndexType(), - result.operands)) + parser.resolveOperands(steps, builder.getIndexType(), result.operands)) return failure(); // Parse out operands and results. @@ -1195,9 +1206,9 @@ // Parse region. SmallVector regionArgs; std::unique_ptr region = std::make_unique(); - for (auto &idx : threadIndices) { - idx.type = builder.getIndexType(); - regionArgs.push_back(idx); + for (auto &iv : ivs) { + iv.type = builder.getIndexType(); + regionArgs.push_back(iv); } for (const auto &it : llvm::enumerate(regionOutArgs)) { auto &out = it.value(); @@ -1215,19 +1226,24 @@ // Parse the optional attribute list. 
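[Reviewer note] The printer/parser changes above round-trip a full `= (lbs) to (ubs) step (steps)` clause, so non-normalized bounds become representable even though some consumers (e.g. the GPU mapping, via the `isNormalized()` helper added below) reject them. A hypothetical example of such a loop:

```mlir
// Accepted by the new parser, but not normalized: lower bound 2 and step 3.
// isNormalized() (defined further down) returns false here, and the GPU
// mapping above would report "unsupported non-normalized loops".
func.func @non_normalized(%buf: memref<100xf32>) {
  %cst = arith.constant 0.0 : f32
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %c20 = arith.constant 20 : index
  scf.foreach_thread (%i) = (%c2) to (%c20) step (%c3) {
    memref.store %cst, %buf[%i] : memref<100xf32>
  }
  return
}
```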
if (parser.parseOptionalAttrDict(result.attributes)) return failure(); - result.addAttribute("operand_segment_sizes", - parser.getBuilder().getDenseI32ArrayAttr( - {static_cast(threadNums.size()), - static_cast(outOperands.size())})); + result.addAttribute( + "operand_segment_sizes", + parser.getBuilder().getDenseI32ArrayAttr( + {static_cast(lbs.size()), static_cast(ubs.size()), + static_cast(steps.size()), + static_cast(outOperands.size())})); return success(); } // Bodyless builder, outputs must be specified. void ForeachThreadOp::build(mlir::OpBuilder &builder, - mlir::OperationState &result, ValueRange outputs, - ValueRange numThreads, + mlir::OperationState &result, ValueRange lbs, + ValueRange ubs, ValueRange steps, + ValueRange outputs, std::optional mapping) { - result.addOperands(numThreads); + result.addOperands(lbs); + result.addOperands(ubs); + result.addOperands(steps); result.addOperands(outputs); if (mapping.has_value()) { result.addAttribute(ForeachThreadOp::getMappingAttrName(result.name), @@ -1236,7 +1252,9 @@ result.addAttribute( "operand_segment_sizes", - builder.getDenseI32ArrayAttr({static_cast(numThreads.size()), + builder.getDenseI32ArrayAttr({static_cast(lbs.size()), + static_cast(ubs.size()), + static_cast(steps.size()), static_cast(outputs.size())})); result.addTypes(TypeRange(outputs)); @@ -1248,9 +1266,8 @@ builder.createBlock(bodyRegion); Block &bodyBlock = bodyRegion->front(); // Add block arguments for indices and outputs. - bodyBlock.addArguments( - SmallVector(numThreads.size(), builder.getIndexType()), - SmallVector(numThreads.size(), result.location)); + bodyBlock.addArguments(SmallVector(lbs.size(), builder.getIndexType()), + SmallVector(lbs.size(), result.location)); bodyBlock.addArguments( TypeRange(outputs), SmallVector(outputs.size(), result.location)); @@ -1259,16 +1276,21 @@ // Builder that takes a bodyBuilder lambda. void ForeachThreadOp::build( - mlir::OpBuilder &builder, mlir::OperationState &result, ValueRange outputs, - ValueRange numThreads, ArrayRef mapping, + mlir::OpBuilder &builder, mlir::OperationState &result, ValueRange lbs, + ValueRange ubs, ValueRange steps, ValueRange outputs, + ArrayRef mapping, function_ref bodyBuilder) { - result.addOperands(numThreads); + result.addOperands(lbs); + result.addOperands(ubs); + result.addOperands(steps); result.addOperands(outputs); result.addAttribute(ForeachThreadOp::getMappingAttrName(result.name), builder.getArrayAttr(mapping)); result.addAttribute( "operand_segment_sizes", - builder.getDenseI32ArrayAttr({static_cast(numThreads.size()), + builder.getDenseI32ArrayAttr({static_cast(lbs.size()), + static_cast(ubs.size()), + static_cast(steps.size()), static_cast(outputs.size())})); result.addTypes(TypeRange(outputs)); @@ -1277,9 +1299,8 @@ builder.createBlock(bodyRegion); Block &bodyBlock = bodyRegion->front(); // Add block arguments for indices and outputs. - bodyBlock.addArguments( - SmallVector(numThreads.size(), builder.getIndexType()), - SmallVector(numThreads.size(), result.location)); + bodyBlock.addArguments(SmallVector(lbs.size(), builder.getIndexType()), + SmallVector(lbs.size(), result.location)); bodyBlock.addArguments( TypeRange(outputs), SmallVector(outputs.size(), result.location)); @@ -1294,6 +1315,15 @@ #endif // NDEBUG } +// Checks if the lbs are zeros and steps are ones. 
+bool ForeachThreadOp::isNormalized() { + return llvm::all_of( + getLowerBound(), + [](Value v) { return matchPattern(v, mlir::m_Zero()); }) && + llvm::all_of(getStep(), + [](Value v) { return matchPattern(v, m_One()); }); +} + // The ensureTerminator method generated by SingleBlockImplicitTerminator is // unaware of the fact that our terminator also needs a region to be // well-formed. We override it here to ensure that we do the right thing. diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1004,15 +1004,15 @@ /// Return `true` if the given loop may have 0 iterations. bool mayHaveZeroIterations(scf::ForeachThreadOp foreachThreadOp) { - int64_t p = 1; - for (Value v : foreachThreadOp.getNumThreads()) { - if (std::optional c = getConstantIntValue(v)) { - p *= *c; - } else { + for (auto [lb, ub] : llvm::zip(foreachThreadOp.getLowerBound(), + foreachThreadOp.getUpperBound())) { + std::optional lbConst = getConstantIntValue(lb); + std::optional ubConst = getConstantIntValue(ub); + if (!lbConst.has_value() || !ubConst.has_value() || *lbConst >= *ubConst) { return true; } } - return p == 0; + return false; } /// Bufferization of ForeachThreadOp. This also bufferizes the terminator of the @@ -1087,8 +1087,9 @@ rewriter.setInsertionPoint(foreachThreadOp); ForeachThreadOp newForeachThreadOp; newForeachThreadOp = rewriter.create( - foreachThreadOp.getLoc(), /*outputs=*/ValueRange(), - foreachThreadOp.getNumThreads(), foreachThreadOp.getMapping()); + foreachThreadOp.getLoc(), foreachThreadOp.getLowerBound(), + foreachThreadOp.getUpperBound(), foreachThreadOp.getStep(), + /*outputs=*/ValueRange(), foreachThreadOp.getMapping()); newForeachThreadOp.getBody()->getTerminator()->erase(); @@ -1127,10 +1128,27 @@ bool isRepetitiveRegion(Operation *op, unsigned index) const { auto foreachThreadOp = cast(op); - // This op is not repetitive if it has just a single thread. - return !llvm::all_of(foreachThreadOp.getNumThreads(), [](Value v) { - return getConstantIntValue(v) == static_cast(1); - }); + + // This op is not repetitive if it has just a single step. 
+ for (auto [lb, ub, step] : llvm::zip(foreachThreadOp.getLowerBound(), + foreachThreadOp.getUpperBound(), + foreachThreadOp.getStep())) { + std::optional lbConstant = getConstantIntValue(lb); + if (!lbConstant) + return true; + + std::optional ubConstant = getConstantIntValue(ub); + if (!ubConstant) + return true; + + std::optional stepConstant = getConstantIntValue(step); + if (!stepConstant) + return true; + + if (*lbConstant + *stepConstant < *ubConstant) + return true; + } + return false; } }; diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp --- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp @@ -180,10 +180,10 @@ if (scf::ForeachThreadOp foreachThreadOp = scf::getForeachThreadOpThreadIndexOwner(iv)) { for (int64_t idx = 0; idx < foreachThreadOp.getRank(); ++idx) { - if (foreachThreadOp.getThreadIndices()[idx] == iv) { - lb = OpBuilder(iv.getContext()).getIndexAttr(0); - ub = foreachThreadOp.getNumThreads()[idx]; - step = OpBuilder(iv.getContext()).getIndexAttr(1); + if (foreachThreadOp.getInductionVar(idx) == iv) { + lb = foreachThreadOp.getLowerBound()[idx]; + ub = foreachThreadOp.getUpperBound()[idx]; + step = foreachThreadOp.getStep()[idx]; return success(); } } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir @@ -144,14 +144,14 @@ // CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref // CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index func.func @parallel_insert_slice( - %t: tensor {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true}, - %sz: index) - -> (tensor) -{ + %t: tensor {bufferization.buffer_layout = affine_map<(d0) -> (d0)>, bufferization.writable = true}, + %sz: index) -> (tensor) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %f0 = arith.constant 0.0: f32 %c512 = arith.constant 512 : index - %r1 = scf.foreach_thread (%iv) in (%c512) shared_outs(%o = %t) -> (tensor) { + %r1 = scf.foreach_thread (%iv) = (%c0) to (%c512) step (%c1) shared_outs(%o = %t) -> (tensor) { // tensor.empty itself does not alloc but forwards to the insert_slice. // EmptyTensorOpElimination replaces the tensor.empty with an inplace // extract_slice. 
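[Reviewer note] The bufferization interface changes above now answer `mayHaveZeroIterations` and `isRepetitiveRegion` from the (lb, ub, step) triples instead of from `num_threads`. A small sketch of the two situations they distinguish (constants chosen purely for illustration):

```mlir
func.func @repetitiveness_examples(%ub: index, %buf: memref<8xf32>) {
  %cst = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Dynamic upper bound: %ub may be <= 0, so mayHaveZeroIterations()
  // conservatively answers true for this loop.
  scf.foreach_thread (%i) = (%c0) to (%ub) step (%c1) {
    memref.store %cst, %buf[%i] : memref<8xf32>
  }
  // Constant single-trip loop (lb + step >= ub): isRepetitiveRegion() now
  // reports it as non-repetitive, matching the old "all num_threads are 1"
  // rule.
  scf.foreach_thread (%j) = (%c0) to (%c1) step (%c1) {
    memref.store %cst, %buf[%j] : memref<8xf32>
  }
  return
}
```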
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -14,14 +14,15 @@ // ----- func.func @map_nested_foreach_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -30,10 +31,10 @@ gpu.terminator } - %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -55,14 +56,15 @@ // ----- func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -71,10 +73,10 @@ gpu.terminator } - %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, 
%c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -96,12 +98,13 @@ // ----- func.func @map_nested_foreach_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -122,9 +125,9 @@ // ----- func.func @map_nested_foreach_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) { - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %c1 = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32> gpu.terminator @@ -159,22 +162,23 @@ // ----- func.func @map_foreach_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c900 = arith.constant 900 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // expected-note @below {{when applied to this payload op}} - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c900) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c900) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : memref<2 x 32 x f32> } { mapping = [#gpu.thread, #gpu.thread] } - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -197,19 +201,20 @@ // expected-note @below {{when applied to this payload op}} func.func @map_foreach_to_blocks_large_loop(%x: 
memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c65537 = arith.constant 65536 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index - scf.foreach_thread (%i, %j) in (%c7, %c65537) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c65537) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : memref<2 x 32 x f32> } { mapping = [#gpu.thread, #gpu.thread] } - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -229,9 +234,10 @@ // ----- func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c65535 = arith.constant 65535 : index - scf.foreach_thread (%i, %j) in (%c65535, %c65535) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c65535, %c65535) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32> %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32> %6 = math.fma %alpha, %4, %5 : f32 @@ -251,12 +257,13 @@ !type = memref<32x32xf32> func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c32, %c32) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = arith.mulf %4, %5 : f32 @@ -277,9 +284,9 @@ // ----- func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32>, %stream : !gpu.async.token) { - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %c1 = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { // expected-error @below {{'linalg.generic' op must have "tensor semantic" for tiling}} // expected-note @below {{when applied to this op}} diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -10,6 +10,7 @@ func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { %c9 = arith.constant 9 : index %c7 = 
arith.constant 7 : index + %zero = arith.constant 0 : index %one = arith.constant 1 : index // CHECK: gpu.launch // CHECK: %[[BLKX:.*]] = gpu.block_id x @@ -19,7 +20,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c7, %c9) step (%one, %one) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 @@ -46,6 +47,7 @@ // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<2x32xf32> // CHECK-SAME: %[[ARGT:[0-9a-z]+]]: memref<32xf32> func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %zero = arith.constant 0 : index %one = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index @@ -68,13 +70,13 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c7, %c9) step (%one, %one) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (%c12) { + scf.foreach_thread (%i) = (%zero) to (%c12) step (%one) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d @@ -98,22 +100,24 @@ // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32> // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32> func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index %c32 = arith.constant 32 : index %c64 = arith.constant 64 : index - %c4 = arith.constant 4 : index -// CHECK: %[[C32:.*]] = arith.constant 32 : index -// CHECK: %[[C64:.*]] = arith.constant 64 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]]) -// CHECK: %[[BLKX:.*]] = gpu.block_id x -// CHECK: %[[BLKY:.*]] = gpu.block_id y -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y -// CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] -// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] - scf.foreach_thread (%i, %j) in (%c32, %c64) { - scf.foreach_thread (%k, %l) in (%c4, %c32) { +// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[C64:.*]] = arith.constant 64 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]]) +// CHECK: %[[BLKX:.*]] = gpu.block_id x +// CHECK: %[[BLKY:.*]] = gpu.block_id y +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: memref.load 
%[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] +// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c32, %c64) step (%c1, %c1) { + scf.foreach_thread (%k, %l) = (%c0, %c0) to (%c4, %c32) step (%c1, %c1) { %4 = memref.load %x[%i, %j, %k, %l] : !type4d %5 = memref.load %y[%i, %j, %k, %l] : !type4d %6 = math.fma %alpha, %4, %5 : f32 @@ -137,6 +141,7 @@ // CHECK-LABEL: func.func @saxpy2d_no_barrier( func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { + %zero = arith.constant 0 : index %one = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index @@ -146,7 +151,7 @@ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c7, %c9) step (%one, %one) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 @@ -170,15 +175,16 @@ // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x32xf32> // CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x32xf32> func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index - %one = arith.constant 1 : index - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { // CHECK: %[[TIDX:.*]] = gpu.thread_id x // CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]] // CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]] - scf.foreach_thread (%i) in (%c32) { + scf.foreach_thread (%i) = (%c0) to (%c32) step (%c1) { %4 = memref.load %x[%i, %i] : !type %5 = memref.load %y[%i, %i] : !type %6 = arith.mulf %4, %5 : f32 @@ -202,16 +208,17 @@ // CHECK-LABEL: func.func @saxpy3d_fold_id_z( func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { - %one = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK-NOT: gpu.thread_id z - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j, %k) in (%one, %c7, %c9) { + scf.foreach_thread (%i, %j, %k) = (%c0, %c0, %c0) to (%c1, %c7, %c9) step (%c1, %c1, %c1) { // CHECK: memref.load %{{.*}}[%[[C0]], // CHECK: memref.load %{{.*}}[%[[C0]], %4 = memref.load %x[%i, %j, %k] : !type @@ -238,23 +245,24 @@ // CHECK-LABEL: func.func @map_multi_level( func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { - %one = 
arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c12 = arith.constant 12 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // check that the thread level got distributed but not the warp level. // CHECK-NOT: {mapping = #gpu.thread // CHECK: {mapping = [#gpu.warp]} - %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) - threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { - scf.foreach_thread (%i, %j) in (%c7, %c9) { + scf.foreach_thread (%i, %j) = (%c0, %c0) to (%c7, %c9) step (%c1, %c1) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type %6 = math.fma %alpha, %4, %5 : f32 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - scf.foreach_thread (%i) in (%c12) { + scf.foreach_thread (%i) = (%c0) to (%c12) step (%c1) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -876,11 +876,13 @@ // ----- func.func @reduce_dispatch_0() -> tensor<4x2xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c4 = arith.constant 4 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<4x2xf32> - %res = scf.foreach_thread (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) { + %res = scf.foreach_thread (%arg0, %arg1) = (%c0, %c0) to (%c4, %c2) step (%c1, %c1) shared_outs(%o = %0) -> (tensor<4x2xf32>) { %1 = tensor.empty() : tensor<1x1xf32> %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32> scf.foreach_thread.perform_concurrently { diff --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir --- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir @@ -15,7 +15,9 @@ func.func @matmul(%A: tensor, %B: tensor, %C: tensor) -> tensor { // CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index - // CHECK: scf.foreach_thread ({{.*}}) in (%[[C10]], %[[C20]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor) { + // CHECK: scf.foreach_thread + // CHECK-SAME: to (%[[C10]], %[[C20]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor) { // CHECK: %[[tA:.*]] = tensor.extract_slice %[[A]]{{.*}} : tensor to tensor // CHECK: %[[tB:.*]] = tensor.extract_slice %[[B]]{{.*}} : tensor to tensor // CHECK: %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor to tensor @@ -60,7 +62,9 @@ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size_1]]] // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map0]]()[%[[N]], %[[tile_size_2]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[NT0]], %[[NT1]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: 
tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -97,7 +101,9 @@ func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK-DAG: %[[c10:.+]] = arith.constant 10 : index // CHECK-DAG: %[[c21:.+]] = arith.constant 21 : index - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[c10]], %[[c21]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[c10]], %[[c21]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) // CHECK-NOT: affine.min @@ -140,7 +146,9 @@ // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]] // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[NT0]], %[[NT1]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) @@ -177,7 +185,9 @@ func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK-DAG: %[[c10:.+]] = arith.constant 10 : // CHECK-DAG: %[[c15:.+]] = arith.constant 15 : - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[c10]], %[[c15]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[c10]], %[[c15]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK-NOT: affine.max // CHECK-NOT: affine.min @@ -226,7 +236,9 @@ // CHECK-LABEL: extract_source( // CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: scf.foreach_thread (%[[ARG:.*]]) in (%[[C2]]) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) { +// CHECK: scf.foreach_thread (%[[ARG:.*]]) = +// CHECK-SAME: to (%[[C2]]) +// CHECK-SAME: shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) { // CHECK: %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]]) // CHECK: scf.foreach_thread.perform_concurrently { // CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32> @@ -254,7 +266,9 @@ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 : // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size]]] // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]] - // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) + // CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) = + // CHECK-SAME: to (%[[NT0]], %[[NT1]]) + // CHECK-SAME: shared_outs(%[[C_BLK:.*]] = %[[C]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -290,7 +304,9 @@ %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>) -> (tensor<100xf32>, tensor<100xf32>) { // CHECK-DAG: %[[c0:.+]] = arith.constant 7 : -// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (%[[c0]]) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) +// CHECK: scf.foreach_thread (%[[IV0:.+]]) = +// 
CHECK-SAME: to (%[[c0]]) +// CHECK-SAME: shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) // CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]]) // CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) // CHECK-NOT: affine.min @@ -346,7 +362,9 @@ %OUT1: tensor<300x100xf32>, %OUT2: tensor<300xf32>) -> (tensor<300x100xf32>, tensor<300xf32>) { // CHECK-DAG: %[[c0:.+]] = arith.constant 4 : -// CHECK: scf.foreach_thread (%[[IV0:.+]]) in (%[[c0]]) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) +// CHECK: scf.foreach_thread (%[[IV0:.+]]) = +// CHECK-SAME: to (%[[c0]]) +// CHECK-SAME: shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) // CHECK: %[[LB:.+]] = affine.apply #[[$map0]](%[[IV0]]) // CHECK: %[[tIN1:.+]] = tensor.extract_slice %[[IN2]][0, %[[LB]]] [100, 75] // CHECK: %[[tIN2:.+]] = tensor.extract_slice %[[IN3]][%[[LB]]] [75] diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir --- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir @@ -12,12 +12,13 @@ func.func @fuse_tileable_op(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor %1 = affine.apply #map0()[%d0, %arg0] // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%1) step (%c1) shared_outs(%o = %arg2) -> (tensor) { %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%d0, %arg0] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -63,11 +64,13 @@ // CHECK-SAME: %[[IN:[0-9a-z]+]]: tensor<64xf32> // CHECK-SAME: %[[OUT:[0-9a-z]+]]: tensor<64xf32> func.func @fuse_untileable_op(%arg0: index, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<64xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = tensor.empty(%arg0) : tensor %1 = affine.apply #map0()[%arg0] // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<64xf32>) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%1) step (%c1) shared_outs(%o = %arg2) -> (tensor<64xf32>) { // CHECK: %[[INIT_TENSOR:.*]] = tensor.empty %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%arg0] @@ -108,13 +111,14 @@ func.func @fuse_tileable_op_rank_reducing(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor // CHECK: scf.foreach_thread {{.*}} -> (tensor) { - %2 = scf.foreach_thread (%arg3) in (%d0) shared_outs(%o = %0) -> (tensor) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%d0) step (%c1) shared_outs(%o = %0) -> (tensor) { %5 = tensor.extract_slice %o[%arg3] [1] [1] : tensor to tensor - + // CHECK: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor to tensor<1xf32> // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : tensor<1xf32>) -> tensor<1xf32> // CHECK: tensor.extract_slice %{{.*}}[0] [1] [1] : tensor<1xf32> to tensor @@ -154,12 +158,14 @@ func.func 
@fuse_tileable_op_through_bbarg(%arg0: index, %arg1: tensor, %arg2: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor) -> tensor %d0 = tensor.dim %arg1, %c0 : tensor %1 = affine.apply #map0()[%d0, %arg0] - // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor) { - %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %0) -> (tensor) { + // CHECK: scf.foreach_thread + // CHECK-SAME: shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor) { + %2 = scf.foreach_thread (%arg3) = (%c0) to (%1) step (%c1) shared_outs(%o = %0) -> (tensor) { %3 = affine.apply #map1(%arg3)[%arg0] %4 = affine.min #map2(%arg3)[%d0, %arg0] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor @@ -204,6 +210,7 @@ func.func @fuse_tileable_multi_output_op(%idx: index, %in: tensor, %out_1: tensor, %out_2: tensor, %out_3: tensor) -> tensor { %cst = arith.constant 4.200000e+01 : f32 %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %0:2 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], @@ -219,7 +226,7 @@ %1 = affine.apply #map0()[%d0, %idx] // CHECK: scf.foreach_thread {{.*}} { - %2 = scf.foreach_thread (%i) in (%1) shared_outs(%o = %out_2) -> (tensor) { + %2 = scf.foreach_thread (%i) = (%c0) to (%1) step (%c1) shared_outs(%o = %out_2) -> (tensor) { %3 = affine.apply #map1(%i)[%idx] %4 = affine.min #map2(%i)[%d0, %idx] %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir --- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir @@ -128,7 +128,9 @@ // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) = +//CHECK-SAME: to (%[[C5]]) +//CHECK-SAME: shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] // CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]]) // CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor @@ -183,7 +185,9 @@ // CHECK-DAG: %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) -> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) = +//CHECK-SAME: to (%[[C5]]) +//CHECK-SAME: shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]] // CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]]) // CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor to tensor @@ -241,7 +245,9 @@ // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor // CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor) 
-> tensor -// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (%[[C5]]) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { +// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) = +//CHECK-SAME: to (%[[C5]]) +//CHECK-SAME: shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor) { // CHECK: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor to tensor // CHECK: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[LB:.+]] = affine.apply #[[MAP0]]()[%[[IV]]] diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -1484,9 +1484,10 @@ %arg0 : tensor<1x5xf32>, %arg1: tensor, %num_threads : index) -> index { // CHECK: %[[c1:.*]] = arith.constant 1 : index + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) { + %2 = scf.foreach_thread (%tidx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %arg1) -> (tensor) { scf.foreach_thread.perform_concurrently { tensor.parallel_insert_slice %arg0 into %o[%tidx, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor } diff --git a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir --- a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir +++ b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir @@ -2,13 +2,15 @@ func.func @reduce() { // CHECK: %[[C64:.*]] = arith.constant 64 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %cst_0 = arith.constant -0.000000e+00 : f32 %0 = memref.alloc() : memref<128x384xf32> linalg.fill ins(%cst_0 : f32) outs(%0 : memref<128x384xf32>) %2 = memref.alloc() : memref<128xf32> linalg.fill ins(%cst_0 : f32) outs(%2 : memref<128xf32>) - scf.foreach_thread (%arg0) in (%c2) { + scf.foreach_thread (%arg0) = (%c0) to (%c2) step (%c1) { %7 = affine.min affine_map<(d0) -> (d0 * -64 + 128, 64)>(%arg0) %8 = affine.max affine_map<(d0) -> (0, d0)>(%7) %9 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -544,11 +544,12 @@ // ----- func.func @wrong_num_results(%in: tensor<100xf32>, %out: tensor<100xf32>) { + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index // expected-error @+1 {{1 operands present, but expected 2}} - %result:2 = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { + %result:2 = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> scf.foreach_thread.perform_concurrently { tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] : @@ -561,10 +562,11 @@ // ----- func.func @invalid_insert_dest(%in: tensor<100xf32>, %out: tensor<100xf32>) { + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) { + %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> (tensor<100xf32>) { %1 
= tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.foreach_thread.perform_concurrently {
      // expected-error @+1 {{may only insert into an output block argument}}
@@ -578,10 +580,11 @@
// -----
func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> (tensor<100xf32>) {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
    scf.foreach_thread.perform_concurrently {
@@ -596,10 +599,11 @@
// -----
func.func @mismatched_mapping(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %c65535 = arith.constant 65535 : index
  // expected-error @below {{'scf.foreach_thread' op mapping attribute size must match op rank}}
-  scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+  scf.foreach_thread (%i, %j) = (%zero, %zero) to (%c65535, %c65535) step (%one, %one) {
    %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
    %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
    %6 = math.fma %alpha, %4, %5 : f32
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -612,9 +612,10 @@
    -> tensor<320xf32> {
  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %cst = arith.constant -0.000000e+00 : f32
  %c320 = arith.constant 320 : index
-  %4 = scf.foreach_thread (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
+  %4 = scf.foreach_thread (%arg0) = (%c0) to (%c320) step (%c1) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1] : tensor<320x10240xf32> to tensor<1x10240xf32>
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
@@ -108,18 +108,19 @@
// -----
// CHECK-LABEL: func @scf_foreach_thread_out_of_place(
//  CHECK-SAME:   %[[arg0:.*]]: tensor<100xf32>, %[[arg1:.*]]: tensor<100xf32>
// CHECK-FUNC-LABEL: func @scf_foreach_thread_out_of_place(
func.func @scf_foreach_thread_out_of_place(%in: tensor<100xf32>,
                                           %out: tensor<100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK-FUNC-NOT: alloc_tensor
  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() copy(%[[arg1]]) {bufferization.escape = [false]} : tensor<100xf32>
  // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[o:.*]] = %[[alloc]])
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
    // CHECK: tensor.extract_slice
    // CHECK: scf.foreach_thread.perform_concurrently
    // CHECK: tensor.parallel_insert_slice %{{.*}} into %[[o]]
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -543,8 +543,8 @@
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
-  // CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]])
-  %2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor) {
+  // CHECK: scf.foreach_thread (%[[tidx:.*]]) to (%[[idx2]])
+  %2 = scf.foreach_thread (%arg3) = (%c0) to (%idx2) step (%c1) shared_outs(%o = %arg2) -> (tensor) {
    // CHECK: %[[subview:.*]] = memref.subview %[[arg2]][5] [%[[idx]]] [1]
    %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor
    // CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview]] : memref (tensor) {
+  // CHECK: scf.foreach_thread (%[[tidx:.*]]) to (%[[idx2]])
+  %2 = scf.foreach_thread (%arg3) = (%c0) to (%idx2) step (%c1) shared_outs(%o = %arg2) -> (tensor) {
    // CHECK: %[[subview1:.*]] = memref.subview %[[alloc1]][5] [%[[idx]]] [1]
    %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor to tensor
@@ -625,12 +625,15 @@
#map1 = affine_map<(d0) -> (d0 * 2)>
// CHECK-LABEL: func.func @matmul
-func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<8x8xf32> {bufferization.writable = true}) -> tensor<8x8xf32> {
+func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>,
+                  %arg2: tensor<8x8xf32> {bufferization.writable = true}) -> tensor<8x8xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
-  // CHECK: scf.foreach_thread {{.*}}
-  %0 = scf.foreach_thread (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
+  // CHECK: scf.foreach_thread
+  %0 = scf.foreach_thread (%arg3, %arg4) = (%c0, %c0) to (%c2, %c4) step (%c1, %c1) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
    %1 = affine.apply #map0(%arg3)
    %3 = tensor.extract_slice %arg0[%1, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
    %4 = affine.apply #map1(%arg4)
@@ -651,6 +654,8 @@
// CHECK-LABEL: func @scf_foreach_private_var(
//  CHECK-SAME:   %[[t:.*]]: memref<10xf32
func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c5 = arith.constant 5 : index
@@ -658,13 +663,13 @@
  // CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32>
  // CHECK: memref.copy %[[t]], %[[t_copy]]
-  // CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) {
+  // CHECK: scf.foreach_thread
  // Load from the copy and store into the shared output.
  // CHECK: %[[subview:.*]] = memref.subview %[[t]]
  // CHECK: memref.load %[[t_copy]]
  // CHECK: memref.store %{{.*}}, %[[subview]]
-  %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
+  %0 = scf.foreach_thread (%tid) = (%c0) to (%c2) step (%c1) shared_outs(%o = %t) -> tensor<10xf32> {
    %offset = arith.muli %c5, %tid : index
    %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32>
@@ -686,13 +691,15 @@
//  CHECK-SAME:   %[[t0:.*]]: memref<10xf32, {{.*}}>, %[[t1:.*]]: memref<10xf32
func.func @scf_foreach_privatized_but_not_copied(
    %t0: tensor<10xf32>, %t1: tensor<10xf32>) -> f32 {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c5 = arith.constant 5 : index
  // CHECK-NOT: memref.alloc
  // CHECK-NOT: memref.copy
-  // CHECK: scf.foreach_thread {{.*}} {
-  %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> {
+  // CHECK: scf.foreach_thread
+  %0 = scf.foreach_thread (%tid) = (%c0) to (%c2) step (%c1) shared_outs(%o = %t0) -> tensor<10xf32> {
    %offset = arith.muli %c5, %tid : index
    %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32>
diff --git a/mlir/test/Dialect/SCF/ops.mlir b/mlir/test/Dialect/SCF/ops.mlir
--- a/mlir/test/Dialect/SCF/ops.mlir
+++ b/mlir/test/Dialect/SCF/ops.mlir
@@ -313,6 +313,7 @@
// CHECK-LABEL: func.func @simple_example
func.func @simple_example(%in: tensor<100xf32>, %out: tensor<100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
@@ -323,7 +324,7 @@
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
@@ -335,12 +336,14 @@
// CHECK-LABEL: func.func @elide_terminator
func.func @elide_terminator() -> () {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK: scf.foreach_thread
  // CHECK-NEXT: } {mapping = [#gpu.thread]}
  // CHECK-NEXT: return
-  scf.foreach_thread (%thread_idx) in (%num_threads) {
+  scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) {
    scf.foreach_thread.perform_concurrently {
    }
  } {mapping = [#gpu.thread]}
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1531,10 +1531,12 @@
  %c1 = arith.constant 1 : index
  // CHECK-NOT: tensor.cast
-  // CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor) {
+  // CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) = (%{{[0-9a-z]*}})
+  // CHECK-SAME: to (%[[num_threads]]) step (%{{[0-9a-z]*}})
+  // CHECK-SAME: shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor) {
  // CHECK-NEXT: scf.foreach_thread.perform_concurrently {
  // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][%[[tidx]], 0] [1, 5] [1, 1]
-  %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor) {
+  %2 = scf.foreach_thread (%tidx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %arg1) -> (tensor) {
    %3 = tensor.cast %arg0 : tensor<1x5xf32> to tensor
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %3 into %o[%tidx, %c0] [%c1, 5] [%c1, %c1] : tensor into tensor
@@ -1553,10 +1555,10 @@
{
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
-  // CHECK: scf.foreach_thread () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
+  // CHECK: scf.foreach_thread () = () to () step () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
  // CHECK-NEXT: scf.foreach_thread.perform_concurrently {
  // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][0, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<1x5xf32>
-  %2 = scf.foreach_thread () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
+  %2 = scf.foreach_thread () = () to () step () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %arg0 into %o[%c0, %c0] [1, 5] [%c1, %c1] : tensor<1x5xf32> into tensor<1x5xf32>
    }
diff --git a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
--- a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
+++ b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
@@ -29,7 +29,9 @@
// FOREACH-DAG: %[[c5:.+]] = arith.constant 5 : index
// FOREACH-DAG: %[[c7:.+]] = arith.constant 7 : index
// FOREACH-DAG: %[[init:.+]] = tensor.empty() : tensor<20x11xf32>
-// FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) in (%[[c20]]) shared_outs(%[[dest:.+]] = %[[init]])
+// FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) =
+// FOREACH-SAME: to (%[[c20]])
+// FOREACH-SAME: shared_outs(%[[dest:.+]] = %[[init]])
// FOREACH: %[[multiIndex:.+]]:3 = affine.delinearize_index %[[iv]] into (%[[c3]], %[[c5]], %[[c7]]
// FOREACH: %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex]]#0, %[[multiIndex]]#1, %[[multiIndex]]#2, 0] [1, 1, 1, 11] [1, 1, 1, 1] :
// FOREACH: %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3]{{\]}} :
@@ -137,7 +139,9 @@
// FOREACH-DAG: %[[d1:.+]] = tensor.dim %[[arg0]], %[[c1]] :
// FOREACH-DAG: %[[d2:.+]] = tensor.dim %[[arg0]], %[[c2]] :
// FOREACH-DAG: %[[d4:.+]] = tensor.dim %[[arg0]], %[[c4]] :
-// FOREACH: %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]])
+// FOREACH: %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) =
+// FOREACH-SAME: to (%[[sz1]], %[[sz2]])
+// FOREACH-SAME: shared_outs(%[[dest:.+]] = %[[init]])
// FOREACH-DAG: %[[iv1:.+]] = affine.apply #[[map1]](%[[tid1]])[%[[lb1]]]
// FOREACH: %[[multiIndex1:.+]]:3 = affine.delinearize_index %[[iv1]] into (%[[c3]], %[[d1]], %[[d2]]) :
// FOREACH-DAG: %[[iv2:.+]] = affine.apply #[[map1]](%[[tid2]])[%[[lb2]]]
diff --git a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
--- a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
+++ b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
@@ -88,9 +88,10 @@
// CHECK-NOT: tensor.insert_slice
// CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor into tensor<1x2xf32>
func.func @parallel_insert_slice(%t0: tensor<1x2xf32>, %t1: tensor, %t2: tensor<1x1xf32>) -> tensor<1x2xf32> {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
-  %r = scf.foreach_thread (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
+  %r = scf.foreach_thread (%arg2, %arg3) = (%c0, %c0) to (%c1, %c2) step (%c1, %c1) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
    %inserted_slice = tensor.insert_slice %t1 into %t2[0, 0] [1, 1] [1, 1] : tensor into tensor<1x1xf32>
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %inserted_slice into %arg4[%arg2, %arg3] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x2xf32>
diff --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -42,9 +42,11 @@
func.func @rank_reducing_parallel_insert_of_collapse_shape(
    %t: tensor, %d: tensor, %sz: index, %thr: index) -> tensor {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
  %0 = tensor.collapse_shape %t [[0, 1], [2], [3]] : tensor into tensor
-  %1 = scf.foreach_thread (%iv) in (%thr) shared_outs(%o = %d) -> (tensor) {
+  %1 = scf.foreach_thread (%iv) = (%c0) to (%thr) step (%c1) shared_outs(%o = %d) -> (tensor) {
    scf.foreach_thread.perform_concurrently {
      tensor.parallel_insert_slice %0 into %o[0, 0, 0, 0][%sz, 1, 1, 5][1, 1, 1, 1] : tensor into tensor
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -193,11 +193,12 @@
// CHECK-LABEL: func.func @rank_reducing_parallel_insert_slice
func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tensor<200x100xf32>) {
+  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %num_threads = arith.constant 100 : index
  // CHECK: scf.foreach_thread {{.*}} {
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
+  %result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs (%o = %out) -> tensor<200x100xf32> {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    scf.foreach_thread.perform_concurrently {
      // CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
@@ -335,7 +336,7 @@
// CHECK-LABEL: func @dim_not_reading(
//  CHECK-SAME:   %[[t:.*]]: memref, %f: f32, %pos: index)
+func.func @dim_not_reading(%t: tensor, %f: f32, %pos: index) -> (tensor, index) {
  %c0 = arith.constant 0 : index
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -247,8 +247,14 @@
                    tensor::ExtractSliceFromCollapseHelper &helper,
                    PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
-    auto foreachOp = rewriter.create<scf::ForeachThreadOp>(
-        loc, /*outputs=*/dest, /*numThreads=*/helper.getIterationSpaceSizes(),
+    unsigned numLoops = helper.getIterationSpaceSizes().size();
+    SmallVector<Value> lbs(numLoops,
+                           rewriter.create<arith::ConstantIndexOp>(loc, 0));
+    SmallVector<Value> steps(numLoops,
+                             rewriter.create<arith::ConstantIndexOp>(loc, 1));
+    auto foreachThreadOp = rewriter.create<scf::ForeachThreadOp>(
+        loc, lbs, /*numThreads=*/helper.getIterationSpaceSizes(), steps,
+        /*outputs=*/dest,
        /*mapping=*/ArrayRef<Attribute>{},
        [&](OpBuilder &nestedBuilder, Location loc, ValueRange regionArgs) {
          unsigned numThreadIdRegionArgs =
@@ -267,7 +273,7 @@
          nestedBuilder.create<tensor::ParallelInsertSliceOp>(
              loc, tile, outputArgs[0], insertParams);
        });
-    rewriter.replaceOp(op, foreachOp->getResult(0));
+    rewriter.replaceOp(op, foreachThreadOp->getResult(0));
    return success();
  }
};