diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -114,6 +114,20 @@
   let assemblyFormat = "attr-dict";
 }
 
+def GPU_LinearIdOp : GPU_Op<"linear_id", [
+    Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+  let description = [{
+    Returns the linearized id within the workgroup (block).
+
+    Example:
+    ```mlir
+    %linearId = gpu.linear_id
+    ```
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
@@ -64,6 +64,22 @@
   }];
 }
 
+def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [
+    DimX, DimY, DimZ]> {
+  let cppNamespace = "::mlir::gpu";
+}
+
+def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
+  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
+  let parameters = (ins
+    EnumParameter<LinearIdEnum>:$linear_id
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute that allows defining thread parallelism for GPU devices in
+    terms of a linearized (flat) thread id within the workgroup.
+  }];
+}
+
 def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
     DimX, DimY, DimZ]> {
   let cppNamespace = "::mlir::gpu";
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -33,33 +33,71 @@
 namespace transform {
 namespace gpu {
 
+constexpr int64_t kWarpSize = 32;
+
+/// Helper type for functions that generate ids for the mapping of a
+/// scf.forall.
+struct IdBuilderResult {
+  // Ops used to replace the forall induction variables.
+  SmallVector<Value> mappingIdOps;
+  // Actual mapping sizes used to predicate the forall body when they are
+  // smaller than the availableMappingSizes.
+  SmallVector<int64_t> predicateMappingSizes;
+  // Ops used to predicate the forall body when predicateMappingSizes is
+  // smaller than the availableMappingSizes.
+  SmallVector<Value> predicateIdOps;
+};
+using GpuIdBuilderFnType = llvm::function_ref<IdBuilderResult(
+    RewriterBase &, scf::ForallOp, ArrayRef<int64_t>, ArrayRef<int64_t>)>;
+
+/// Helper struct for passing the mapping attributes and id generator to the
+/// common forall rewriter.
+struct GpuIdBuilder {
+  /// The mapping attributes targeted by this generator.
+  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
+  /// The callback that builds the concrete IR for the mapping ids.
+  GpuIdBuilderFnType idBuilder;
+};
+
 /// Map the top level `scf.forall` op to GPU Thread Blocks.
 /// Mapping is one-to-one and the induction variables of `scf.forall` are
-/// rewritten to gpu.block_id according to the thread_dim_apping attribute.
+/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
 /// Dynamic, `scf.forall` trip counts are currently not supported.
 /// Dynamic block dim sizes are currently not supported.
-DiagnosedSilenceableFailure mapForallToBlocksImpl(
-    RewriterBase &rewriter, TransformOpInterface transformOp,
-    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        blockIdGenerator);
-
-/// Search `scf.forall` ops nested under `target` and map each such op to GPU
-/// threads. Mapping is one-to-one and the induction variables of `scf.forall`
-/// are rewritten to gpu.thread_id according to the thread_dim_mapping
-/// attribute.
-/// Sibling `scf.forall` are supported in which case, the union of the number of
-/// threads is computed and may result in predication.
+DiagnosedSilenceableFailure
+mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
+                      scf::ForallOp forallOp,
+                      SmallVectorImpl<int64_t> &gridDims,
+                      const GpuIdBuilder &gpuIdBuilder);
+
+/// Map the given `scf.forall` op to an explicit GPU implementation along
+/// `availableMappingSizes`.
+/// The mapping is one-to-one and the induction variables of `scf.forall` are
+/// rewritten to the ids produced by gpuIdBuilder.idBuilder according to the
+/// gpuIdBuilder.mappingAttributes attributes.
 /// Dynamic, `scf.forall` trip counts are currently not supported.
-/// Dynamic block dim sizes are currently not supported.
+/// Dynamic `availableMappingSizes` are currently not supported.
+/// `availableMappingSizes` is expected to be of size 3.
+DiagnosedSilenceableFailure mapOneForallToThreadsImpl(
+    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
+    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);
+
+/// Search `scf.forall` ops nested under `target` and map each such op to an
+/// explicit GPU implementation along blockDims, warpDims and linearDims.
+/// The mapping is one-to-one and the induction variables of `scf.forall` are
+/// rewritten to thread, warp or linear ids according to their mapping
+/// attributes.
+/// Dynamic, `scf.forall` trip counts are currently not supported.
+/// Dynamic `blockDims`, `warpDims` or `linearDims` sizes are currently not
+/// supported.
+/// `blockDims` is expected to be of size 3.
+/// `warpDims` is expected to be empty or of size 3.
+/// The insertion point is expected to be set at the beginning of the target
+/// body block and dominate all other blocks.
 DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
-    Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        threadIdGenerator);
+    Operation *target, ArrayRef<int64_t> blockDims, ArrayRef<int64_t> warpDims,
+    bool syncAfterDistribute);
 
 /// Find the unique top level scf::ForallOp within a given target op.
 DiagnosedSilenceableFailure
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -22,21 +26,26 @@
        TransformEachOpTrait, TransformOpInterface]> {
   let description = [{
-    Target the `gpu.launch op` and rewrite all `scf.forall`
-    nested in it to distributed `gpu.thread_id` attribute.
-
-    The operation searches for `scf.forall` ops nested under `target`
-    and maps each such op to GPU threads. Mapping is one-to-one and the
-    induction variables of `scf.forall` are rewritten to
-    `gpu.thread_id` according to the `mapping` attribute.
-
-    Sibling `scf.forall` are supported in which case, the union of
-    the number of threads is computed and may result in predication.
-
-    Multiple scf.forall are supported per `gpu.launch` in which case,
-    the max of all the threads is computed and taken for the global
-    `gpu.thread_id`. If necessary, `scf.forall` that do not use the
-    whole thread range result in predicated computations.
+    Target the `gpu.launch op` and rewrite all `scf.forall` nested in it to
+    distributed `gpu.thread_id` ops.
+
+    The operation searches for `scf.forall` ops nested under `target` and maps
+    each such op to GPU threads.
+
+    `scf.forall` induction variables are rewritten to `gpu.thread_id` according
+    to the `mapping` attribute.
+
+    Different types of mapping attributes are supported:
+      - `block_dims` is a list of integers that specifies the number of
+        threads in each dimension. This is a mandatory attribute that is used
+        to constrain the number of threads in each dimension. If an
+        `scf.forall` op is mapped to fewer threads, predication occurs.
+      - `warp_dims` is a list of integers that specifies the number of
+        warps in each dimension. This is an optional attribute that is used
+        to constrain the number of warps in each dimension. When present, this
+        attribute must be specified in a way that is compatible with the
+        `block_dims` attribute. If an `scf.forall` op is mapped to fewer warps,
+        predication occurs.
 
     Dynamic `scf.forall` trip counts are currently not supported.
     Dynamic block dim sizes are currently not supported.
@@ -45,10 +50,12 @@
     Only `scf.forall` distributed to **at most 3 dimensions** are
     currently supported.
 
-    Barriers are inserted after each scf.forall op for now.
+    The `sync_after_distribute` attribute controls whether a `gpu.barrier` is
+    inserted after each scf.forall op. At this time, this is an all-or-nothing
+    choice. This will need to be tightened in the future.
 
-    The operation alters the block size of the given gpu_launch using
-    blockDim argument.
+    The operation alters the block size of the given gpu_launch using the
+    mandatory block_dims argument.
 
     #### Return modes:
 
@@ -83,6 +90,7 @@
       gpu.terminator
     }
     ```
+
     is translated to:
 
     ```
@@ -104,11 +112,18 @@
   }];
 
   let arguments = (ins PDL_Operation:$target,
-                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
-                   DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
+                   DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
+                   DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute);
   let results = (outs PDL_Operation:$result);
 
-  let assemblyFormat = "$target attr-dict";
+  let assemblyFormat = [{
+    $target
+    `block_dims` `=` $block_dims
+    (`warp_dims` `=` $warp_dims^)?
+    (`sync_after_distribute` `=` $sync_after_distribute^)?
+    attr-dict
+  }];
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::Operation *target,
@@ -117,7 +132,6 @@
   }];
 }
 
-
 def MapForallToBlocks : Op:$gridDim,
+    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$grid_dims,
     UnitAttr:$generate_gpu_launch);
   let results = (outs PDL_Operation:$result);
 
-  let assemblyFormat = "$target attr-dict";
+  let assemblyFormat = [{
+    $target
+    (`generate_gpu_launch` $generate_gpu_launch^)?
+    (`grid_dims` `=` $grid_dims^)?
+    attr-dict
+  }];
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::Operation *target,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -46,6 +46,10 @@
   return static_cast<int64_t>(getWarp());
 }
 
+int64_t GPULinearIdMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getLinearId());
+}
+
 int64_t GPUThreadMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getThread());
 }
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -118,6 +118,11 @@
   setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
 }
 
+void LinearIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
+                                   SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
+}
+
 void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
   setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -8,7 +8,9 @@
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -16,9 +18,14 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Visitors.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -27,6 +34,7 @@
 using namespace mlir;
 using namespace mlir::gpu;
 using namespace mlir::transform;
+using namespace mlir::transform::gpu;
 
 #define DEBUG_TYPE "gpu-transforms"
 
@@ -35,68 +43,163 @@
 
 namespace {
 
-/// Helper type for functions that generate ids for the mapping of a scf.forall.
-using IdGeneratorFnType = llvm::function_ref<void(
-    RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>;
+/// Return a flattened thread id for the workgroup with given sizes.
+static OpFoldResult getStaticLinearThreadId(RewriterBase &rewriter, + Location loc, + ArrayRef blockDims) { + assert(blockDims.size() == 3 && "expected 3 workgroup sizes"); + AffineExpr tx, ty, tz, BDX, BDY; + bindDims(rewriter.getContext(), tx, ty, tz); + bindSymbols(rewriter.getContext(), BDX, BDY); + IndexType indexType = rewriter.getIndexType(); + SmallVector threadsAndWorkGroups{ + rewriter.create(loc, indexType, Dimension::x).getResult(), + rewriter.create(loc, indexType, Dimension::y).getResult(), + rewriter.create(loc, indexType, Dimension::z).getResult()}; + threadsAndWorkGroups.push_back(blockDims[0]); + threadsAndWorkGroups.push_back(blockDims[1]); + return makeComposedFoldedAffineApply( + rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups); +} -struct MappingToGpuHelper { - MappingToGpuHelper(SmallVector mappingAttributes, - IdGeneratorFnType idGenerator) - : mappingAttributes(mappingAttributes), idGenerator(idGenerator) {} +struct GpuBlockIdBuilder : public GpuIdBuilder { + + GpuBlockIdBuilder(MLIRContext *ctx) : GpuIdBuilder() { + mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX), + GPUBlockMappingAttr::get(ctx, Blocks::DimY), + GPUBlockMappingAttr::get(ctx, Blocks::DimZ)}, + idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp, + ArrayRef forallMappingSizes, + ArrayRef availableMappingSizes) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(forallOp); + IndexType indexType = rewriter.getIndexType(); + auto loc = forallOp->getLoc(); + SmallVector ids{ + rewriter.create(loc, indexType, Dimension::x), + rewriter.create(loc, indexType, Dimension::y), + rewriter.create(loc, indexType, Dimension::z)}; + return IdBuilderResult{ids, SmallVector{forallMappingSizes}, + ids}; + }; + } +}; - SmallVector mappingAttributes; - IdGeneratorFnType idGenerator; +struct GpuThreadIdBuilder : public GpuIdBuilder { + GpuThreadIdBuilder(MLIRContext *ctx) : GpuIdBuilder() { + mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX), + GPUThreadMappingAttr::get(ctx, Threads::DimY), + GPUThreadMappingAttr::get(ctx, Threads::DimZ)}; + idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp, + ArrayRef forallMappingSizes, + ArrayRef availableMappingSizes) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(forallOp); + IndexType indexType = rewriter.getIndexType(); + auto loc = forallOp->getLoc(); + SmallVector ids{ + rewriter.create(loc, indexType, Dimension::x), + rewriter.create(loc, indexType, Dimension::y), + rewriter.create(loc, indexType, Dimension::z)}; + return IdBuilderResult{ids, SmallVector{forallMappingSizes}, + ids}; + }; + } }; -struct MappingToGpuBlocksHelper : public MappingToGpuHelper { - - MappingToGpuBlocksHelper(MLIRContext *ctx) - : MappingToGpuHelper( - SmallVector{ - GPUBlockMappingAttr::get(ctx, Blocks::DimX), - GPUBlockMappingAttr::get(ctx, Blocks::DimY), - GPUBlockMappingAttr::get(ctx, Blocks::DimZ)}, - IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp, - SmallVectorImpl &ids) { - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(forallOp); - IndexType indexType = rewriter.getIndexType(); - auto loc = forallOp->getLoc(); - ids.assign( - {rewriter.create(loc, indexType, Dimension::x), - rewriter.create(loc, indexType, Dimension::y), - rewriter.create(loc, indexType, Dimension::z)}); - }}) {} +struct GpuWarpIdBuilder : public GpuIdBuilder { + GpuWarpIdBuilder(MLIRContext *ctx) : GpuIdBuilder() { + mappingAttributes = 
{GPUWarpMappingAttr::get(ctx, Warps::DimX), + GPUWarpMappingAttr::get(ctx, Warps::DimY), + GPUWarpMappingAttr::get(ctx, Warps::DimZ)}; + idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp, + ArrayRef forallMappingSizes, + ArrayRef availableMappingSizes) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(forallOp); + Location loc = forallOp.getLoc(); + Value warpId = rewriter.create(loc); + SmallVector reverseBasisSizes( + llvm::reverse(availableMappingSizes)); + LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes, + DBGS() << "--delinearization basis: "); + llvm::dbgs() << "\n"); + + SmallVector strides = computeStrides(reverseBasisSizes); + LLVM_DEBUG(llvm::interleaveComma(strides, + DBGS() << "--delinearization strides: "); + llvm::dbgs() << "\n"); + + AffineExpr d0; + bindDims(rewriter.getContext(), d0); + SmallVector delinearizingExprs = delinearize(d0, strides); + LLVM_DEBUG(llvm::interleaveComma(delinearizingExprs, + DBGS() << "--delinearization exprs: "); + llvm::dbgs() << "\n"); + + SmallVector ids; + for (AffineExpr e : delinearizingExprs) + ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId)); + LLVM_DEBUG(llvm::interleaveComma(ids, DBGS() << "--ids: "); + llvm::dbgs() << "\n"); + return IdBuilderResult{ids, SmallVector{forallMappingSizes}, + ids}; + }; + } }; -struct MappingToGpuThreadsHelper : public MappingToGpuHelper { - MappingToGpuThreadsHelper(MLIRContext *ctx) - : MappingToGpuHelper( - SmallVector{ - GPUThreadMappingAttr::get(ctx, Threads::DimX), - GPUThreadMappingAttr::get(ctx, Threads::DimY), - GPUThreadMappingAttr::get(ctx, Threads::DimZ)}, - IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp, - SmallVectorImpl &ids) { - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(forallOp); - IndexType indexType = rewriter.getIndexType(); - auto loc = forallOp->getLoc(); - ids.assign( - {rewriter.create(loc, indexType, Dimension::x), - rewriter.create(loc, indexType, Dimension::y), - rewriter.create(loc, indexType, Dimension::z)}); - }}) {} +struct GpuLinearIdBuilder : public GpuIdBuilder { + GpuLinearIdBuilder(MLIRContext *ctx) : GpuIdBuilder() { + mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX), + GPULinearIdMappingAttr::get(ctx, LinearId::DimY), + GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)}; + idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp, + ArrayRef forallMappingSizes, + ArrayRef availableMappingSizes) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(forallOp); + Location loc = forallOp.getLoc(); + Value linearIdOp = rewriter.create(loc); + SmallVector reverseBasisSizes(llvm::reverse(forallMappingSizes)); + LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes, + DBGS() << "--delinearization basis: "); + llvm::dbgs() << "\n"); + + SmallVector strides = computeStrides(reverseBasisSizes); + LLVM_DEBUG(llvm::interleaveComma(strides, + DBGS() << "--delinearization strides: "); + llvm::dbgs() << "\n"); + + AffineExpr d0; + bindDims(rewriter.getContext(), d0); + SmallVector delinearizingExprs = delinearize(d0, strides); + LLVM_DEBUG(llvm::interleaveComma(delinearizingExprs, + DBGS() << "--delinearization exprs: "); + llvm::dbgs() << "\n"); + + SmallVector ids; + for (AffineExpr e : delinearizingExprs) + ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearIdOp)); + LLVM_DEBUG(llvm::interleaveComma(ids, DBGS() << "--ids: "); + llvm::dbgs() << "\n"); + + int64_t actualMappingSize = 1; + for (int64_t s : forallMappingSizes) 
+ actualMappingSize *= s; + return IdBuilderResult{ids, SmallVector{actualMappingSize}, + SmallVector{linearIdOp}}; + }; + } }; } // namespace static DiagnosedSilenceableFailure -failureHelper(std::optional transformOp, - scf::ForallOp forallOp, const Twine &message) { +definiteFailureHelper(std::optional transformOp, + Operation *target, const Twine &message) { if (transformOp.has_value()) - return emitDefiniteFailure(*transformOp, message); - return emitDefiniteFailure(forallOp, message); + return transformOp->emitDefiniteFailure() << message; + return emitDefiniteFailure(target, message); } /// Check if given mapping attributes are one of the desired attributes @@ -104,7 +207,8 @@ checkMappingAttributeTypes(std::optional transformOp, scf::ForallOp forallOp) { if (!forallOp.getMapping().has_value()) - return failureHelper(transformOp, forallOp, "mapping must be present"); + return definiteFailureHelper(transformOp, forallOp, + "mapping must be present"); bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { @@ -114,20 +218,32 @@ llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { return attr.isa(); }); + bool hasWarpMapping = + llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { + return attr.isa(); + }); + bool hasLinearMapping = + llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) { + return attr.isa(); + }); int64_t countMappingTypes = 0; countMappingTypes += hasBlockMapping ? 1 : 0; countMappingTypes += hasThreadMapping ? 1 : 0; + countMappingTypes += hasWarpMapping ? 1 : 0; + countMappingTypes += hasLinearMapping ? 1 : 0; if (countMappingTypes > 1) { - return failureHelper(transformOp, forallOp, - "cannot mix different mapping types, use nesting"); + return definiteFailureHelper( + transformOp, forallOp, + "cannot mix different mapping types, use nesting"); } DenseSet seen; for (Attribute map : forallOp.getMapping()->getValue()) { if (llvm::is_contained(seen, map)) { - return failureHelper(transformOp, forallOp, - "duplicated attribute, cannot map different loops " - "to the same processor"); + return definiteFailureHelper( + transformOp, forallOp, + "duplicated attribute, cannot map different loops " + "to the same processor"); } seen.insert(map); } @@ -146,26 +262,26 @@ // Perform other non-types verifications. if (!forallOp.isNormalized()) - return failureHelper(transformOp, forallOp, - "unsupported non-normalized loops"); + return definiteFailureHelper(transformOp, forallOp, + "unsupported non-normalized loops"); if (forallOp.getNumResults() > 0) - return failureHelper(transformOp, forallOp, - "only bufferized scf.forall can be mapped"); + return definiteFailureHelper(transformOp, forallOp, + "only bufferized scf.forall can be mapped"); if (forallOp.getRank() > 3) - return failureHelper(transformOp, forallOp, - "scf.forall with rank > 3 does not lower"); + return definiteFailureHelper(transformOp, forallOp, + "scf.forall with rank > 3 does not lower"); if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) { return !getConstantIntValue(ofr).has_value(); })) { - return failureHelper(transformOp, forallOp, - "unsupported dynamic sizes in forall op"); + return definiteFailureHelper(transformOp, forallOp, + "unsupported dynamic sizes in forall op"); } return DiagnosedSilenceableFailure::success(); } -/// Determines if the size of the kernel configuration is supported by the GPU -/// architecture being used. 
It presently makes use of CUDA limitations, however -/// that aspect may be enhanced for other GPUs. +/// Determines if the size of the kernel configuration is supported by the +/// GPU architecture being used. It presently makes use of CUDA limitations, +/// however that aspect may be enhanced for other GPUs. static DiagnosedSilenceableFailure checkGpuLimits( TransformOpInterface transformOp, std::optional gridDimX, std::optional gridDimY, std::optional gridDimZ, @@ -192,17 +308,17 @@ gridDimZ.value_or(1) > maxGriddimz || gridDimX.value_or(1) > maxGriddimx) { return transformOp.emitSilenceableError() - << "Trying to launch a GPU kernel with gridDim = (" + << "Trying to launch a GPU kernel with grid_dims = (" << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " - << gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1) - << ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1) - << "). It is larger than the limits."; + << gridDimZ.value_or(1) << ") block_dims = (" + << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", " + << blockDimZ.value_or(1) << "). It is larger than the limits."; } return DiagnosedSilenceableFailure::success(); } -/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and -/// put a terminator within. +/// Creates an empty-body gpu::LaunchOp using the provided kernel settings +/// and put a terminator within. static DiagnosedSilenceableFailure createGpuLaunch(RewriterBase &rewriter, Location loc, TransformOpInterface transformOp, LaunchOp &launchOp, @@ -278,24 +394,36 @@ return DiagnosedSilenceableFailure::success(); } -//===----------------------------------------------------------------------===// -// MapForallToBlocks -//===----------------------------------------------------------------------===// +/// Struct to return the result of the rewrite of a forall operation. +struct ForallRewriteResult { + SmallVector mappingSizes; + SmallVector mappingIds; +}; -static FailureOr> rewriteOneForallCommonImpl( +/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR. +template +static void +replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc, + OperationOrBlock *parent, Value replacement, + ArrayRef availableMappingSizes) { + parent->walk([&](OpTy idOp) { + if (availableMappingSizes[static_cast(idOp.getDimension())] == 1) + rewriter.replaceAllUsesWith(idOp.getResult(), replacement); + }); +} + +static DiagnosedSilenceableFailure rewriteOneForallCommonImpl( RewriterBase &rewriter, std::optional transformOp, - scf::ForallOp forallOp, - const SmallVectorImpl &availableMappingSizes, - const ArrayRef &allMappingAttributes, - IdGeneratorFnType idGenerator) { + scf::ForallOp forallOp, ForallRewriteResult &result, + ArrayRef availableMappingSizes, const GpuIdBuilder &gpuIdBuilder) { LDBG("Start rewriteOneForallCommonImpl"); // Step 0. GPU-specific verifications. There is no better place to anchor - // those right now: the ForallOp is target-independent and the transform op - // does not apply to individual ForallOp. + // those right now: the ForallOp is target-independent and the transform + // op does not apply to individual ForallOp. DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp); if (!diag.succeeded()) - return failure(); + return diag; // Step 1. Complete the mapping to a full mapping (with 1s) if necessary. 
SmallVector tmpMappingSizes = llvm::to_vector( @@ -304,48 +432,42 @@ assert(maybeStaticValue && "expected static value"); return maybeStaticValue.value(); })); - SmallVector forallMappings = + SmallVector forallMappingAttrs = llvm::to_vector(forallOp.getMapping()->getValue()); - for (auto attr : allMappingAttributes) { - if (llvm::is_contained(forallMappings, attr)) + for (auto attr : gpuIdBuilder.mappingAttributes) { + if (llvm::is_contained(forallMappingAttrs, attr)) continue; - forallMappings.push_back(attr); + forallMappingAttrs.push_back(attr); tmpMappingSizes.push_back(1); } + LLVM_DEBUG(llvm::interleaveComma( + tmpMappingSizes, + DBGS() << "--tmpMappingSizes extracted from scf.forall op: "); + llvm::dbgs() << "\n"); // Step 2. sort the values by the corresponding DeviceMappingAttrInterface. auto comparator = [&](DeviceMappingAttrInterface a, DeviceMappingAttrInterface b) -> bool { return a.getMappingId() < b.getMappingId(); }; - SmallVector mappingSizes = - getValuesSortedByKey(forallMappings, tmpMappingSizes, comparator); - LLVM_DEBUG(llvm::interleaveComma(mappingSizes, DBGS() << "mappingSizes: "); - llvm::dbgs() << "\n"; - llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: "); + SmallVector forallMappingSizes = + getValuesSortedByKey(forallMappingAttrs, tmpMappingSizes, comparator); + LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes, + DBGS() << "--forallMappingSizes: "); + llvm::dbgs() << "\n"; llvm::interleaveComma( + forallMappingAttrs, DBGS() << "--mappingAttrs: "); llvm::dbgs() << "\n"); - // Step 3. Generate the mappingIdOps using the provided generator and map the - // induction variables to the newly created ops. Replace ids of dimension - // known to be of size 1 by zero to simplify the IR. - SmallVector mappingIdOps; - Location loc = forallOp.getLoc(); - idGenerator(rewriter, forallOp, mappingIdOps); - LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: "); - llvm::dbgs() << "\n"); - assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes"); - Value zero = rewriter.create(loc, 0); - if (!availableMappingSizes.empty()) { - for (size_t i : llvm::seq(size_t(0), availableMappingSizes.size())) { - if (availableMappingSizes[i] == 1) - mappingIdOps[i] = zero; - } - } + // Step 3. Generate the mappingIdOps using the provided generator and map + // the induction variables to the newly created ops. + IdBuilderResult builderResult = gpuIdBuilder.idBuilder( + rewriter, forallOp, forallMappingSizes, availableMappingSizes); + SmallVector mappingIdOps = builderResult.mappingIdOps; IRMapping bvm; for (auto [iv, dim] : llvm::zip_equal(forallOp.getInductionVars(), - ArrayRef{forallMappings}.take_front( + ArrayRef{forallMappingAttrs}.take_front( forallOp.getInductionVars().size()))) { Value peIdOp = mappingIdOps[static_cast( dim.cast().getMappingId())]; @@ -354,28 +476,39 @@ // Step 4. Maybe create conditionals to predicate the region. // Skip this step when availableMappingSizes is empty. 
+ Location loc = forallOp.getLoc(); Value predicate; if (!availableMappingSizes.empty()) { - LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes, - DBGS() << "availableMappingSizes: "); - llvm::dbgs() << "\n"); - for (auto [id, mappingSize, availableMappingSize] : - llvm::zip_equal(mappingIdOps, mappingSizes, availableMappingSizes)) { + SmallVector predicateMappingSizes = + builderResult.predicateMappingSizes; + SmallVector predicateIdOps = builderResult.predicateIdOps; + // clang-format off + LLVM_DEBUG( + llvm::interleaveComma( + predicateMappingSizes, DBGS() << "--predicateMappingSizes: "); + llvm::dbgs() << "\n"; + llvm::interleaveComma( + availableMappingSizes, DBGS() << "--availableMappingSizes: "); + llvm::dbgs() << "\n"; + llvm::interleaveComma(predicateIdOps, DBGS() << "--predicateIdOps: "); + llvm::dbgs() << "\n"); + // clang-format on + for (auto [id, mappingSize, availableMappingSize] : llvm::zip_equal( + predicateIdOps, predicateMappingSizes, availableMappingSizes)) { if (mappingSize > availableMappingSize) { - (void)failureHelper( + return definiteFailureHelper( transformOp, forallOp, "Trying to map to fewer GPU threads than loop iterations but " "overprovisioning is not yet supported. " "Try additional tiling of the before mapping or map to more " "threads."); - return failure(); } if (mappingSize == availableMappingSize) continue; Value idx = rewriter.create(loc, mappingSize); Value tmpPredicate = rewriter.create( loc, arith::CmpIPredicate::ult, id, idx); - LDBG("predicate: " << tmpPredicate); + LDBG("--predicate: " << tmpPredicate); predicate = predicate ? rewriter.create(loc, predicate, tmpPredicate) : tmpPredicate; @@ -389,12 +522,13 @@ Block::iterator insertionPoint; if (predicate) { // Step 5.a. If predicated, move at the beginning. - auto ifOp = - rewriter.create(loc, predicate, /*withElseRegion=*/false); + auto ifOp = rewriter.create(loc, predicate, + /*withElseRegion=*/false); targetBlock = ifOp.thenBlock(); insertionPoint = ifOp.thenBlock()->begin(); } else { - // Step 5.b. Otherwise, move inline just at the rewriter insertion point. + // Step 5.b. Otherwise, move inline just at the rewriter insertion + // point. targetBlock = forallOp->getBlock(); insertionPoint = rewriter.getInsertionPoint(); } @@ -402,7 +536,7 @@ targetBlock->getOperations().splice(insertionPoint, sourceBlock.getOperations()); - // Step 6. RAUW thread indices to thread ops. + // Step 6. RAUW indices. for (Value loopIndex : forallOp.getInductionVars()) { Value threadIdx = bvm.lookup(loopIndex); rewriter.replaceAllUsesWith(loopIndex, threadIdx); @@ -411,23 +545,49 @@ // Step 7. Erase old op. rewriter.eraseOp(forallOp); - return mappingSizes; + result = ForallRewriteResult{forallMappingSizes, mappingIdOps}; + return DiagnosedSilenceableFailure::success(); } +//===----------------------------------------------------------------------===// +// MapForallToBlocks +//===----------------------------------------------------------------------===// + DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl( RewriterBase &rewriter, TransformOpInterface transformOp, scf::ForallOp forallOp, SmallVectorImpl &gridDims, - const ArrayRef &allMappingAttributes, - IdGeneratorFnType idGenerator) { - // Pass an empty anyAvailableMappingSizes. + const GpuIdBuilder &gpuIdBuilder) { + + // Create an early zero index value for replacements. + Location loc = forallOp.getLoc(); + Block *parentBlock = forallOp->getBlock(); + Value zero; + { + // RAII block. 
+ OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(parentBlock); + zero = rewriter.create(loc, 0); + } + SmallVector anyAvailableMappingSizes; - FailureOr> maybeMappingSizes = - rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, - anyAvailableMappingSizes, allMappingAttributes, - idGenerator); - if (failed(maybeMappingSizes)) - return DiagnosedSilenceableFailure::definiteFailure(); - gridDims = *maybeMappingSizes; + ForallRewriteResult rewriteResult; + // Pass an empty anyAvailableMappingSizes. + DiagnosedSilenceableFailure diag = + rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult, + anyAvailableMappingSizes, gpuIdBuilder); + + // Return if anything goes wrong, use silenceable failure as a match failure. + if (!diag.succeeded()) + return diag; + + // Set the gridDims that act as a return. + gridDims = rewriteResult.mappingSizes; + + // Replace ids of dimensions known to be 1 by 0 to simplify the IR. + // Here, the result of mapping determines the available mapping sizes. + replaceUnitMappingIdsHelper(rewriter, loc, parentBlock, zero, + gridDims); + return DiagnosedSilenceableFailure::success(); } @@ -476,7 +636,7 @@ return diag; } - SmallVector gridDims = extractFromI64ArrayAttr(getGridDim()); + SmallVector gridDims{getGridDims()}; if (!getGenerateGpuLaunch() && gridDims.size() != 3) return transformOp.emitDefiniteFailure("transform require size-3 mapping"); @@ -496,17 +656,14 @@ topLevelForallOp = cast(newForallOp); } - diag = verifyGpuMapping(transformOp, topLevelForallOp); - if (!diag.succeeded()) - return diag; - - MappingToGpuBlocksHelper helper(getContext()); + GpuBlockIdBuilder gpuBlockIdBuilder(getContext()); diag = mlir::transform::gpu::mapForallToBlocksImpl( - rewriter, transformOp, topLevelForallOp, gridDims, - helper.mappingAttributes, helper.idGenerator); + rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder); if (!diag.succeeded()) return diag; + // Set the GPU launch configuration for the grid dims late, this is subject to + // IR inspection. diag = alterGpuLaunch(rewriter, gpuLaunch, cast(getOperation()), gridDims[0], gridDims[1], gridDims[2]); @@ -519,37 +676,155 @@ // MapNestedForallToThreads //===----------------------------------------------------------------------===// +DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl( + RewriterBase &rewriter, std::optional transformOp, + scf::ForallOp forallOp, ArrayRef availableMappingSizes, + bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) { + // Ignore cases with different attributes than this builder supports. + for (Attribute map : forallOp.getMapping()->getValue()) { + if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) { + LDBG("--skip " << map); + LLVM_DEBUG(llvm::interleaveComma(gpuIdBuilder.mappingAttributes, + DBGS() << "----not in: "); + llvm::dbgs() << "\n";); + return emitSilenceableFailure(forallOp); + } + } + + Location loc = forallOp.getLoc(); + OpBuilder::InsertionGuard g(rewriter); + // Insert after to allow for syncthreads after `forall` is erased. + rewriter.setInsertionPointAfter(forallOp); + ForallRewriteResult rewriteResult; + DiagnosedSilenceableFailure diag = + rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult, + availableMappingSizes, gpuIdBuilder); + + // Return if anything goes wrong, use silenceable failure as a match failure. + if (!diag.succeeded()) + return diag; + + // Add a syncthreads if needed. 
TODO: warpsync + if (syncAfterDistribute) + rewriter.create(loc); + + return DiagnosedSilenceableFailure::success(); +} + DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl( RewriterBase &rewriter, std::optional transformOp, - Operation *target, const SmallVectorImpl &kernelBlockDims, - bool syncAfterDistribute, - const ArrayRef &allMappingAttributes, - IdGeneratorFnType idGenerator) { + Operation *target, ArrayRef blockDims, ArrayRef warpDims, + bool syncAfterDistribute) { + MLIRContext *ctx = rewriter.getContext(); + + if (blockDims.size() != 3) + return definiteFailureHelper(transformOp, target, + "requires size-3 thread mapping"); + if (!warpDims.empty()) { + if (warpDims.size() != 3) + return definiteFailureHelper(transformOp, target, + "requires empty or size-3 warp mapping"); + } + + // Create an early zero index value for replacements. + Location loc = target->getLoc(); + Value zero = rewriter.create(loc, 0); + SmallVector blockDimsOfr = + getAsIndexOpFoldResult(ctx, blockDims); + DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success(); - target->walk([&](scf::ForallOp forallOp) { - // Ignore cases with different attributes. - for (Attribute map : forallOp.getMapping()->getValue()) { - if (!llvm::is_contained(allMappingAttributes, map)) { + WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) { + //===--------------------------------------------------------------------===// + // Mapping to warp ids. + //===--------------------------------------------------------------------===// + if (!warpDims.empty()) { + LLVM_DEBUG( + llvm::interleaveComma( + warpDims, DBGS() << "mapNestedForallToThreadsImpl warpDims: "); + llvm::dbgs() << "\n"); + GpuWarpIdBuilder gpuWarpIdBuilder(ctx); + diag = mlir::transform::gpu::mapOneForallToThreadsImpl( + rewriter, transformOp, forallOp, warpDims, syncAfterDistribute, + gpuWarpIdBuilder); + // Use silenceable failure to encode "failure to match" and pass + // through. + if (diag.isDefiniteFailure()) + return WalkResult::interrupt(); + + // Perform late SubgroupIdOp replacement, taking blockDims into + // account. + if (diag.succeeded()) { + target->walk([&](SubgroupIdOp subgroupIdOp) { + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(subgroupIdOp); + auto linearThreadId = getStaticLinearThreadId( + rewriter, subgroupIdOp.getLoc(), blockDimsOfr); + LDBG("----linearThreadId: " << linearThreadId); + + AffineExpr ltid = getAffineDimExpr(0, ctx); + auto warpId = makeComposedFoldedAffineApply( + rewriter, subgroupIdOp.getLoc(), ltid.floorDiv(kWarpSize), + {linearThreadId}); + LDBG("----warpId: " << warpId); + rewriter.replaceAllUsesWith(subgroupIdOp, warpId.get()); + }); return WalkResult::skip(); } } - diag = verifyGpuMapping(transformOp, forallOp); + + //===--------------------------------------------------------------------===// + // Mapping to linear ids. + //===--------------------------------------------------------------------===// + LDBG("mapNestedForallToThreadsImpl linearDims"); + int64_t numThreads = 1; + for (int64_t b : blockDims) + numThreads *= b; + GpuLinearIdBuilder gpuLinearIdBuilder(ctx); + diag = mlir::transform::gpu::mapOneForallToThreadsImpl( + rewriter, transformOp, forallOp, {numThreads}, syncAfterDistribute, + gpuLinearIdBuilder); + // Use silenceable failure to encode "failure to match" and pass through. 
+ if (diag.isDefiniteFailure()) + return WalkResult::interrupt(); if (diag.succeeded()) { - // Take the loc ahead of time - Location loc = forallOp.getLoc(); - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPointAfter(forallOp); - if (failed(rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, - kernelBlockDims, - allMappingAttributes, idGenerator))) - diag = DiagnosedSilenceableFailure::definiteFailure(); - // Add a syncthreads if needed. TODO: warpsync - if (syncAfterDistribute) - rewriter.create(loc); + // Perform late replacement of LinearIdOp, taking blockDims into account. + target->walk([&](LinearIdOp linearIdOp) { + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(linearIdOp); + auto linearThreadId = getStaticLinearThreadId( + rewriter, linearIdOp.getLoc(), blockDimsOfr); + LDBG("----linearThreadId: " << linearThreadId); + rewriter.replaceAllUsesWith(linearIdOp, linearThreadId.get()); + }); + return WalkResult::skip(); } - return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt(); + + //===--------------------------------------------------------------------===// + // Mapping to block ids (happens last so we can replay ThreadIdOp). + //===--------------------------------------------------------------------===// + LLVM_DEBUG( + llvm::interleaveComma( + blockDims, DBGS() << "mapNestedForallToThreadsImpl blockDims: "); + llvm::dbgs() << "\n"); + GpuThreadIdBuilder gpuThreadIdBuilder(ctx); + diag = mlir::transform::gpu::mapOneForallToThreadsImpl( + rewriter, transformOp, forallOp, blockDims, syncAfterDistribute, + gpuThreadIdBuilder); + // Use silenceable failure to encode "failure to match" and pass through. + if (diag.isDefiniteFailure()) + return WalkResult::interrupt(); + + return WalkResult::advance(); }); - return diag; + if (walkResult.wasInterrupted()) + return diag; + + // Replace ids of dimensions known to be 1 by 0 to simplify the IR. + // Here, the result of mapping determines the available mapping sizes. + replaceUnitMappingIdsHelper(rewriter, loc, target, zero, + blockDims); + + return DiagnosedSilenceableFailure::success(); } DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne( @@ -561,32 +836,29 @@ if (!gpuLaunch) return emitSilenceableError() << "Given target is not a gpu.launch"; - SmallVector blockDims = extractFromI64ArrayAttr(getBlockDim()); - if (blockDims.size() != 3) - return transformOp.emitDefiniteFailure("transform require size-3 mapping"); + // Mapping to block ids. + SmallVector blockDims{getBlockDims()}; DiagnosedSilenceableFailure diag = checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt, blockDims[0], blockDims[1], blockDims[2]); if (diag.isSilenceableFailure()) { - diag.attachNote(getLoc()) << getBlockDimAttrName() << " is too large"; + diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large"; return diag; } - MLIRContext *ctx = getContext(); - IRRewriter rewriter(ctx); - MappingToGpuThreadsHelper helper(ctx); - diag = mlir::transform::gpu::mapNestedForallToThreadsImpl( - rewriter, transformOp, target, blockDims, getSyncAfterDistribute(), - helper.mappingAttributes, helper.idGenerator); - - if (!diag.succeeded()) - return diag; - + // Set the GPU launch configuration for the block dims early, this is not + // subject to IR inspection. 
+ IRRewriter rewriter(getContext()); diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt, std::nullopt, std::nullopt, blockDims[0], blockDims[1], blockDims[2]); + rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); + diag = + mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims, + getWarpDims(), getSyncAfterDistribute()); + results.push_back(gpuLaunch.getOperation()); return diag; } diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir --- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir @@ -8,7 +8,7 @@ ^bb0(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!pdl.operation) -> !pdl.operation // expected-error @below {{Given target is not a gpu.launch}} - %1 = transform.gpu.map_nested_forall_to_threads %funcop + %1 = transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1] } // ----- @@ -47,9 +47,9 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{Trying to launch a GPU kernel with gridDim = (1, 1, 1) blockDim = (1200, 9, 1). It is larger than the limits.}} - // expected-note @below {{"blockDim" is too large}} - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [1200, 9, 1] } + // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (1, 1, 1) block_dims = (1200, 9, 1). It is larger than the limits.}} + // expected-note @below {{"block_dims" is too large}} + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1200, 9, 1] } // ----- @@ -90,7 +90,7 @@ ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation // expected-error @below {{Trying to map to fewer GPU threads than loop iterations but overprovisioning is not yet supported. Try additional tiling of the before mapping or map to more threads.}} - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] } + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] } // ----- @@ -116,7 +116,7 @@ ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation // expected-error @below {{unsupported dynamic sizes}} - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] } + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] } // ----- @@ -138,7 +138,7 @@ %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread, #gpu.thread, #gpu.thread ] ) %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation // expected-error @below {{only bufferized scf.forall can be mapped}} - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] } + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] } // ----- @@ -243,8 +243,8 @@ transform.sequence failures(propagate) { ^bb0(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation - // expected-error @below {{Trying to launch a GPU kernel with gridDim = (65535, 65535, 1) blockDim = (1, 1, 1). 
It is larger than the limits.}} - %1 = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } + // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (65535, 65535, 1) block_dims = (1, 1, 1). It is larger than the limits.}} + %1 = transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch } // ----- @@ -271,7 +271,7 @@ ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation // expected-error @below {{duplicated attribute, cannot map different loops to the same processor}} - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [32, 32, 1]} + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] } // ----- diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -33,7 +33,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - transform.gpu.map_forall_to_blocks %funcop { gridDim = [12, 9, 1]} + transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1] } // ----- @@ -87,7 +87,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1] } + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] } // ----- @@ -127,7 +127,7 @@ ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation %gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } - transform.gpu.map_nested_forall_to_threads %gpuLaunch { blockDim = [32, 4, 1] } + transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1] } // ----- @@ -160,7 +160,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false } // ----- @@ -192,7 +192,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [32, 1, 1]} + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 1, 1] } // ----- @@ -228,7 +228,7 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } + transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false } // ----- @@ -236,29 +236,64 @@ !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> +// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) floordiv 4)> +// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 4) floordiv 2)> + +// CHECK-DAG: 
#[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 12)> +// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) floordiv 20)> +// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)> + // CHECK-LABEL: func.func @map_multi_level( func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { %one = arith.constant 1 : index - %c12 = arith.constant 12 : index + %c10 = arith.constant 10 : index %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index -// check that the thread level got distributed but not the warp level. -// CHECK-NOT: {mapping = #gpu.thread -// CHECK: {mapping = [#gpu.warp]} + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + // CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index + // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index + // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index + + // check that both the thread level and the warp level got distributed. + // CHECK-NOT: #gpu.thread + // CHECK-NOT: #gpu.warp %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { + // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x + // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y scf.forall (%i, %j) in (%c7, %c9) { - %4 = memref.load %x[%i, %j] : !type - %5 = memref.load %y[%i, %j] : !type - %6 = math.fma %alpha, %4, %5 : f32 - memref.store %6, %y[%i, %j] : !type - } { mapping = [#gpu.thread, #gpu.thread]} - scf.forall (%i) in (%c12) { + %4 = memref.load %x[%i, %j] : !type + %5 = memref.load %y[%i, %j] : !type + %6 = math.fma %alpha, %4, %5 : f32 + memref.store %6, %y[%i, %j] : !type + } { mapping = [#gpu.thread, #gpu.thread]} + + // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index + // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index + // CHECK: %[[COND:.*]] = arith.andi %[[CMPY]], %[[CMPX]] : i1 + // CHECK: scf.if %[[COND]] + scf.forall (%i) in (%c1) { %7 = memref.load %t[%i] : !type1d %8 = arith.addf %alpha, %7 : f32 memref.store %8, %t[%i] : !type1d } {mapping = [#gpu.warp] } + + // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index + // CHECK: scf.if %[[COND]] + scf.forall (%i, %j) in (%c10, %c2) { + %7 = memref.load %t[%i] : !type1d + %8 = arith.addf %alpha, %7 : f32 + memref.store %8, %t[%j] : !type1d + } {mapping = [#gpu.linear, #gpu.linear] } gpu.terminator } return %y : !type @@ -267,5 +302,6 @@ transform.sequence failures(propagate) { ^bb1(%arg0: !pdl.operation): %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation - transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1] } + transform.gpu.map_nested_forall_to_threads %funcop + block_dims = [12, 11, 1] warp_dims = [2, 2, 1] }
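
For quick reference, below is a minimal usage sketch of the reworked transform op syntax introduced by this patch. It mirrors the updated tests; the matched handle name and the dim values are illustrative only, and combining `warp_dims` with `sync_after_distribute = false` is just one possible choice.

```mlir
// The scf.forall ops nested under the matched gpu.launch are expected to carry
// #gpu.thread<...>, #gpu.warp<...> or #gpu.linear<...> mapping attributes.
// block_dims is mandatory; warp_dims and sync_after_distribute are optional.
transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):
  %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
    : (!pdl.operation) -> !pdl.operation
  transform.gpu.map_nested_forall_to_threads %funcop
    block_dims = [12, 11, 1] warp_dims = [2, 2, 1] sync_after_distribute = false
}
```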