diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -33,17 +33,32 @@
 namespace transform {
 namespace gpu {
 
+constexpr int64_t kWarpSize = 32;
+
+/// Helper type for functions that generate ids for the mapping of a
+/// scf.forall.
+using GpuIdBuilderFnType = llvm::function_ref<SmallVector<Value>(
+    RewriterBase &, scf::ForallOp, ArrayRef<int64_t> mappingDims)>;
+
+/// Helper struct for passing the mapping attributes and id generator to the
+/// common forall rewriter.
+struct GpuIdBuilder {
+  /// The mapping attributes targeted by this generator.
+  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
+  /// The constructor that builds the concrete IR for mapping ids.
+  GpuIdBuilderFnType idBuilder;
+};
+
 /// Map the top level `scf.forall` op to GPU Thread Blocks.
 /// Mapping is one-to-one and the induction variables of `scf.forall` are
-/// rewritten to gpu.block_id according to the thread_dim_apping attribute.
+/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
 /// Dynamic, `scf.forall` trip counts are currently not supported.
 /// Dynamic block dim sizes are currently not supported.
-DiagnosedSilenceableFailure mapForallToBlocksImpl(
-    RewriterBase &rewriter, TransformOpInterface transformOp,
-    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        blockIdGenerator);
+DiagnosedSilenceableFailure
+mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
+                      scf::ForallOp forallOp,
+                      SmallVectorImpl<int64_t> &gridDims,
+                      const GpuIdBuilder &gpuIdBuilder);
 
 /// Search `scf.forall` ops nested under `target` and map each such op to GPU
 /// threads. Mapping is one-to-one and the induction variables of `scf.forall`
@@ -56,10 +71,7 @@
 DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
     Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        threadIdGenerator);
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);
 
 /// Find the unique top level scf::ForallOp within a given target op.
 DiagnosedSilenceableFailure
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -8,7 +8,9 @@
 
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -16,8 +18,10 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
@@ -27,6 +31,7 @@
 using namespace mlir;
 using namespace mlir::gpu;
 using namespace mlir::transform;
+using namespace mlir::transform::gpu;
 
 #define DEBUG_TYPE "gpu-transforms"
 
@@ -35,58 +40,87 @@
 namespace {
 
-/// Helper type for functions that generate ids for the mapping of a scf.forall.
-using IdGeneratorFnType = llvm::function_ref<void(
-    RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>;
-
-struct MappingToGpuHelper {
-  MappingToGpuHelper(SmallVector<DeviceMappingAttrInterface> mappingAttributes,
-                     IdGeneratorFnType idGenerator)
-      : mappingAttributes(mappingAttributes), idGenerator(idGenerator) {}
+/// Return a flattened thread id for the workgroup with given sizes.
+static OpFoldResult getLinearThreadId(RewriterBase &rewriter, Location loc) {
+  AffineExpr tx, ty, tz, BDX, BDY;
+  bindDims(rewriter.getContext(), tx, ty, tz);
+  bindSymbols(rewriter.getContext(), BDX, BDY);
+  IndexType indexType = rewriter.getIndexType();
+  SmallVector<OpFoldResult> threadsAndWorkGroups{
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
+  threadsAndWorkGroups.push_back(
+      rewriter.create<BlockDimOp>(loc, indexType, Dimension::x).getResult());
+  threadsAndWorkGroups.push_back(
+      rewriter.create<BlockDimOp>(loc, indexType, Dimension::y).getResult());
+  return makeComposedFoldedAffineApply(
+      rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
+}
 
-  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
-  IdGeneratorFnType idGenerator;
+struct GpuBlockIdBuilder : public GpuIdBuilder {
+
+  GpuBlockIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
+    mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
+                         GPUBlockMappingAttr::get(ctx, Blocks::DimY),
+                         GPUBlockMappingAttr::get(ctx, Blocks::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      IndexType indexType = rewriter.getIndexType();
+      auto loc = forallOp->getLoc();
+      return SmallVector<Value>{
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
+    };
+  }
 };
 
-struct MappingToGpuBlocksHelper : public MappingToGpuHelper {
-
-  MappingToGpuBlocksHelper(MLIRContext *ctx)
-      : MappingToGpuHelper(
-            SmallVector<DeviceMappingAttrInterface>{
-                GPUBlockMappingAttr::get(ctx, Blocks::DimX),
-                GPUBlockMappingAttr::get(ctx, Blocks::DimY),
-                GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
-            IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,
-                                 SmallVectorImpl<Value> &ids) {
-              OpBuilder::InsertionGuard guard(rewriter);
-              rewriter.setInsertionPoint(forallOp);
-              IndexType indexType = rewriter.getIndexType();
-              auto loc = forallOp->getLoc();
-              ids.assign(
-                  {rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
-                   rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
-                   rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)});
-            }}) {}
+struct GpuThreadIdBuilder : public GpuIdBuilder {
+  GpuThreadIdBuilder(MLIRContext *ctx, SmallVector<Value> *idCaptures = nullptr)
+      : GpuIdBuilder() {
+    mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
+                         GPUThreadMappingAttr::get(ctx, Threads::DimY),
+                         GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      IndexType indexType = rewriter.getIndexType();
+      auto loc = forallOp->getLoc();
+      return SmallVector<Value>{
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+    };
+  }
 };
 
-struct MappingToGpuThreadsHelper : public MappingToGpuHelper {
-  MappingToGpuThreadsHelper(MLIRContext *ctx)
-      : MappingToGpuHelper(
-            SmallVector<DeviceMappingAttrInterface>{
-                GPUThreadMappingAttr::get(ctx, Threads::DimX),
-                GPUThreadMappingAttr::get(ctx, Threads::DimY),
-                GPUThreadMappingAttr::get(ctx, Threads::DimZ)},
-            IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,
-                                 SmallVectorImpl<Value> &ids) {
-              OpBuilder::InsertionGuard guard(rewriter);
-              rewriter.setInsertionPoint(forallOp);
-              IndexType indexType = rewriter.getIndexType();
-              auto loc = forallOp->getLoc();
-              ids.assign(
-                  {rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
-                   rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
-                   rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)});
-            }}) {}
+struct GpuWarpIdBuilder : public GpuIdBuilder {
+  GpuWarpIdBuilder(MLIRContext *ctx, SmallVector<Value> *idCaptures = nullptr)
+      : GpuIdBuilder() {
+    mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
+                         GPUWarpMappingAttr::get(ctx, Warps::DimY),
+                         GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
+                   ArrayRef<int64_t> mappingDims) {
+      OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(forallOp);
+      Location loc = forallOp.getLoc();
+      // NOTE(review): the created op's template argument was lost in transit;
+      // gpu.subgroup_id (warp == subgroup) is the closest match — TODO confirm
+      // against the original revision.
+      Value warpId = rewriter.create<SubgroupIdOp>(loc);
+      SmallVector<int64_t> reverseBlockDims(llvm::reverse(mappingDims));
+      SmallVector<int64_t> strides = computeStrides(reverseBlockDims);
+      AffineExpr d0;
+      bindDims(rewriter.getContext(), d0);
+      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+      SmallVector<Value> ids;
+      for (AffineExpr e : delinearizingExprs)
+        ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
+      return ids;
+    };
+  }
 };
 
 } // namespace
 
@@ -95,7 +129,7 @@
 failureHelper(std::optional<TransformOpInterface> transformOp,
               scf::ForallOp forallOp, const Twine &message) {
   if (transformOp.has_value())
-    return emitDefiniteFailure(*transformOp, message);
+    return transformOp->emitSilenceableError() << message;
   return emitDefiniteFailure(forallOp, message);
 }
 
@@ -114,9 +148,14 @@
       llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
         return attr.isa<GPUThreadMappingAttr>();
       });
+  bool hasWarpMapping =
+      llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
+        return attr.isa<GPUWarpMappingAttr>();
+      });
   int64_t countMappingTypes = 0;
   countMappingTypes += hasBlockMapping ? 1 : 0;
   countMappingTypes += hasThreadMapping ? 1 : 0;
+  countMappingTypes += hasWarpMapping ? 1 : 0;
   if (countMappingTypes > 1) {
     return failureHelper(transformOp, forallOp,
                         "cannot mix different mapping types, use nesting");
@@ -163,9 +202,9 @@
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Determines if the size of the kernel configuration is supported by the GPU
-/// architecture being used. It presently makes use of CUDA limitations, however
-/// that aspect may be enhanced for other GPUs.
+/// Determines if the size of the kernel configuration is supported by the
+/// GPU architecture being used. It presently makes use of CUDA limitations,
+/// however that aspect may be enhanced for other GPUs.
 static DiagnosedSilenceableFailure checkGpuLimits(
     TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
     std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
@@ -201,8 +240,8 @@
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and
-/// put a terminator within.
+/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
+/// and put a terminator within.
 static DiagnosedSilenceableFailure
 createGpuLaunch(RewriterBase &rewriter, Location loc,
                 TransformOpInterface transformOp, LaunchOp &launchOp,
@@ -278,21 +317,41 @@
   return DiagnosedSilenceableFailure::success();
 }
 
+/// Struct to return the result of the rewrite of a forall operation.
+struct ForallRewriteResult {
+  SmallVector<int64_t> mappingSizes;
+  SmallVector<Value> mappingIds;
+};
+
+/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
+static void
+replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
+                            ValueRange mappingIds,
+                            ArrayRef<int64_t> availableMappingSizes) {
+  assert(!mappingIds.empty() && "expected some mapping");
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(mappingIds.front().getDefiningOp());
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  for (auto [dim, id] : llvm::zip_equal(availableMappingSizes, mappingIds)) {
+    if (dim == 1)
+      rewriter.replaceAllUsesWith(id, zero);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // MapForallToBlocks
 //===----------------------------------------------------------------------===//
 
-static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(
+static FailureOr<ForallRewriteResult> rewriteOneForallCommonImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
     scf::ForallOp forallOp,
    const SmallVectorImpl<int64_t> &availableMappingSizes,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    const GpuIdBuilder &gpuIdBuilder) {
   LDBG("Start rewriteOneForallCommonImpl");
 
   // Step 0. GPU-specific verifications. There is no better place to anchor
-  // those right now: the ForallOp is target-independent and the transform op
-  // does not apply to individual ForallOp.
+  // those right now: the ForallOp is target-independent and the transform
+  // op does not apply to individual ForallOp.
   DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
   if (!diag.succeeded())
     return failure();
@@ -306,14 +365,15 @@
   }));
   SmallVector<Attribute> forallMappings =
       llvm::to_vector(forallOp.getMapping()->getValue());
-  for (auto attr : allMappingAttributes) {
+  for (auto attr : gpuIdBuilder.mappingAttributes) {
     if (llvm::is_contained(forallMappings, attr))
       continue;
     forallMappings.push_back(attr);
     tmpMappingSizes.push_back(1);
   }
 
-  // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
+  // Step 2. sort the values by the corresponding
+  // DeviceMappingAttrInterface.
   auto comparator = [&](DeviceMappingAttrInterface a,
                         DeviceMappingAttrInterface b) -> bool {
     return a.getMappingId() < b.getMappingId();
   };
@@ -325,22 +385,13 @@
              llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: ");
              llvm::dbgs() << "\n");
 
-  // Step 3. Generate the mappingIdOps using the provided generator and map the
-  // induction variables to the newly created ops. Replace ids of dimension
-  // known to be of size 1 by zero to simplify the IR.
-  SmallVector<Value> mappingIdOps;
-  Location loc = forallOp.getLoc();
-  idGenerator(rewriter, forallOp, mappingIdOps);
+  // Step 3. Generate the mappingIdOps using the provided generator and map
+  // the induction variables to the newly created ops.
+  SmallVector<Value> mappingIdOps =
+      gpuIdBuilder.idBuilder(rewriter, forallOp, mappingSizes);
   LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: ");
              llvm::dbgs() << "\n");
   assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes");
-  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-  if (!availableMappingSizes.empty()) {
-    for (size_t i : llvm::seq(size_t(0), availableMappingSizes.size())) {
-      if (availableMappingSizes[i] == 1)
-        mappingIdOps[i] = zero;
-    }
-  }
 
   IRMapping bvm;
   for (auto [iv, dim] :
@@ -354,6 +405,7 @@
 
   // Step 4. Maybe create conditionals to predicate the region.
   // Skip this step when availableMappingSizes is empty.
+  Location loc = forallOp.getLoc();
   Value predicate;
   if (!availableMappingSizes.empty()) {
     LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes,
@@ -389,12 +441,13 @@
   Block::iterator insertionPoint;
   if (predicate) {
     // Step 5.a. If predicated, move at the beginning.
-    auto ifOp =
-        rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
+    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
+                                           /*withElseRegion=*/false);
     targetBlock = ifOp.thenBlock();
     insertionPoint = ifOp.thenBlock()->begin();
   } else {
-    // Step 5.b. Otherwise, move inline just at the rewriter insertion point.
+    // Step 5.b. Otherwise, move inline just at the rewriter insertion
+    // point.
     targetBlock = forallOp->getBlock();
     insertionPoint = rewriter.getInsertionPoint();
   }
@@ -411,23 +464,30 @@
 
   // Step 7. Erase old op.
   rewriter.eraseOp(forallOp);
 
-  return mappingSizes;
+  return ForallRewriteResult{mappingSizes, mappingIdOps};
 }
 
 DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
     RewriterBase &rewriter, TransformOpInterface transformOp,
     scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    const GpuIdBuilder &gpuIdBuilder) {
+  // Pass an empty anyAvailableMappingSizes.
+  Location loc = forallOp.getLoc();
   SmallVector<int64_t> anyAvailableMappingSizes;
-  FailureOr<SmallVector<int64_t>> maybeMappingSizes =
-      rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
-                                 anyAvailableMappingSizes, allMappingAttributes,
-                                 idGenerator);
-  if (failed(maybeMappingSizes))
+  FailureOr<ForallRewriteResult> rewriteResult = rewriteOneForallCommonImpl(
+      rewriter, transformOp, forallOp, anyAvailableMappingSizes, gpuIdBuilder);
+
+  // Fail if anything goes wrong.
+  if (failed(rewriteResult))
     return DiagnosedSilenceableFailure::definiteFailure();
-  gridDims = *maybeMappingSizes;
+  gridDims = rewriteResult->mappingSizes;
+
+  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
+  // Here, the result of mapping determines the available mapping sizes.
+  replaceUnitMappingIdsHelper(rewriter, loc, rewriteResult->mappingIds,
+                              gridDims);
+
   return DiagnosedSilenceableFailure::success();
 }
 
@@ -500,10 +560,9 @@
   if (!diag.succeeded())
     return diag;
 
-  MappingToGpuBlocksHelper helper(getContext());
+  GpuBlockIdBuilder gpuBlockIdBuilder(getContext());
   diag = mlir::transform::gpu::mapForallToBlocksImpl(
-      rewriter, transformOp, topLevelForallOp, gridDims,
-      helper.mappingAttributes, helper.idGenerator);
+      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
   if (!diag.succeeded())
     return diag;
 
@@ -522,30 +581,36 @@
 DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
     Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) {
   DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
   target->walk([&](scf::ForallOp forallOp) {
-    // Ignore cases with different attributes.
+    // Ignore cases with different attributes than this builder supports.
     for (Attribute map : forallOp.getMapping()->getValue()) {
-      if (!llvm::is_contained(allMappingAttributes, map)) {
+      if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) {
         return WalkResult::skip();
       }
     }
     diag = verifyGpuMapping(transformOp, forallOp);
     if (diag.succeeded()) {
-      // Take the loc ahead of time
       Location loc = forallOp.getLoc();
       OpBuilder::InsertionGuard g(rewriter);
+      // Insert after to allow for syncthreads after `forall` is erased.
       rewriter.setInsertionPointAfter(forallOp);
-      if (failed(rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
-                                            kernelBlockDims,
-                                            allMappingAttributes, idGenerator)))
+      FailureOr<ForallRewriteResult> rewriteResult = rewriteOneForallCommonImpl(
+          rewriter, transformOp, forallOp, kernelBlockDims, gpuIdBuilder);
+
+      // Fail if anything goes wrong.
+      if (failed(rewriteResult))
         diag = DiagnosedSilenceableFailure::definiteFailure();
+      // Add a syncthreads if needed. TODO: warpsync
       if (syncAfterDistribute)
         rewriter.create<BarrierOp>(loc);
+
+      // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
+      // Here, the available mapping sizes are given by `kernelBlockDims`.
+      // Guard the dereference: `rewriteResult` may hold failure() above.
+      if (succeeded(rewriteResult))
+        replaceUnitMappingIdsHelper(rewriter, loc, rewriteResult->mappingIds,
+                                    kernelBlockDims);
     }
     return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
   });
@@ -575,10 +640,11 @@
 
   MLIRContext *ctx = getContext();
   IRRewriter rewriter(ctx);
-  MappingToGpuThreadsHelper helper(ctx);
+  SmallVector<Value> idCaptures;
+  GpuThreadIdBuilder gpuThreadIdBuilder(ctx, &idCaptures);
   diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
       rewriter, transformOp, target, blockDims, getSyncAfterDistribute(),
-      helper.mappingAttributes, helper.idGenerator);
+      gpuThreadIdBuilder);
   if (!diag.succeeded())
     return diag;