diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -1 +1,11 @@ add_mlir_dialect(GPUOps GPUOps) + +set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td) +mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls) +mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRParallelLoopMapperAttrGen) + +set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td) +mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls) +mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs) +add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen) diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td @@ -0,0 +1,26 @@ +//===-- GPUBase.td - GPU dialect definitions ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the GPU dialect +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_BASE +#define GPU_BASE + +include "mlir/IR/OpBase.td" + +//===----------------------------------------------------------------------===// +// GPU Dialect. 
+//===----------------------------------------------------------------------===// + +def GPU_Dialect : Dialect { + let name = "gpu"; +} + +#endif // GPU_BASE \ No newline at end of file diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -13,6 +13,7 @@ #ifndef GPU_OPS #define GPU_OPS +include "mlir/Dialect/GPU/GPUBase.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffects.td" @@ -26,10 +27,6 @@ // GPU Dialect operations. //===----------------------------------------------------------------------===// -def GPU_Dialect : Dialect { - let name = "gpu"; -} - class GPU_Op traits = []> : Op; diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h --- a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h @@ -14,28 +14,44 @@ #ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H #define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H +#include "mlir/IR/Attributes.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc" + namespace mlir { +class AffineMap; +struct LogicalResult; +class Operation; class Region; +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc" + namespace gpu { /// Name of the mapping attribute produced by loop mappers. -static constexpr const char *kMappingAttributeName = "mapping"; -/// Name of the processor sub-attribute that identifies the hardware id -/// to map a loop to. -static constexpr const char *kProcessorEntryName = "processor"; -/// Name of the map sub-attribute that identifies the affine map to apply -/// to the hardware id to compute the iteration number of the loop. 
This -/// map is expected to be extended by step and lower bound computations: -/// index = map(hardware_id) * step + lowerbound -static constexpr const char *kIndexMapEntryName = "map"; -/// Name of the bound sub-attribute that itendities the affine map to -/// compute an upper bound of iterations for the hardware id. This is -/// applied to an upper bound on the number of iterations: -/// launchBound = bound(upperbound-lowerbound ceildiv step) -static constexpr const char *kBoundMapEntryName = "bound"; +StringRef getMappingAttrName(); +/// Get the value of the processor in the ParallelLoopDimMapper attribute. +inline Processor getProcessor(ParallelLoopDimMapper attr) { + return static_cast<Processor>(attr.processor().getInt()); +} + +/// Helper function to create a ParallelLoopDimMapper attribute. +/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method. +ParallelLoopDimMapper getParallelLoopDimMapperAttr(Processor processor, + AffineMap map, + AffineMap bound); + +/// Sets the mapping attribute of a loop.parallel operation. Verifies that the +/// mapping passed is valid. +/// - the number of DimMapperAttr provided is same as the number of loops of +/// the ploopOp. +/// - the mapping does not map multiple loops to the same processor. +LogicalResult setMappingAttr(Operation *op, + ArrayRef<ParallelLoopDimMapper> mapping); } // end namespace gpu /// Maps the parallel loops found in the given function to workgroups. The first @@ -46,5 +62,4 @@ void greedilyMapParallelLoopsToGPU(Region &region); } // end namespace mlir - #endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td @@ -0,0 +1,51 @@ +//===-- ParallelLoopMapperAttr.td - Attribute definition ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the attribute used for driving conversion from loop.parallel to +// gpu.launch operations +// +//===----------------------------------------------------------------------===// + +#ifndef PARALLEL_LOOP_MAPPER_ATTR +#define PARALLEL_LOOP_MAPPER_ATTR + +include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/GPU/GPUBase.td" + +def BLOCKX : I64EnumAttrCase<"BLOCKX", 0>; +def BLOCKY : I64EnumAttrCase<"BLOCKY", 1>; +def BLOCKZ : I64EnumAttrCase<"BLOCKZ", 2>; +def THREADX : I64EnumAttrCase<"THREADX", 3>; +def THREADY : I64EnumAttrCase<"THREADY", 4>; +def THREADZ : I64EnumAttrCase<"THREADZ", 5>; +def SEQUENTIAL : I64EnumAttrCase<"SEQUENTIAL", 6>; + +def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [ + BLOCKX, BLOCKY, BLOCKZ, THREADX, THREADY, THREADZ, SEQUENTIAL]> { + let cppNamespace = "::mlir::gpu"; +} + +// Attribute that drives conversion of a loop.parallel to gpu.launch +// operation. +// processor: the hardware id to map to. +// map : An affine map that is used to pre-process hardware ids before +// substitution. +// bound : An affine map that is used to compute the bound of the hardware +// id based on an upper bound of the number of iterations. 
+def ParallelLoopDimMapperAttr : + StructAttr<"ParallelLoopDimMapper", GPU_Dialect, + [StructFieldAttr<"processor", ProcessorAttr>, + StructFieldAttr<"map", AffineMapAttr>, + StructFieldAttr<"bound", AffineMapAttr>]>; + + +def ParallelLoopMapperAttr : + TypedArrayAttrBase; + +#endif // PARALLEL_LOOP_MAPPER_ATTR diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -500,35 +500,8 @@ PatternMatchResult matchAndRewrite(ParallelOp parallelOp, PatternRewriter &rewriter) const override; }; - -struct MappingAnnotation { - unsigned processor; - AffineMap indexMap; - AffineMap boundMap; -}; - } // namespace -/// Extracts the mapping annotations from the provided attribute. The attribute -/// is expected to be of the form -/// { processor = , map = , bound = } -/// where the bound is optional. -static MappingAnnotation extractMappingAnnotation(Attribute attribute) { - DictionaryAttr dict = attribute.cast(); - unsigned processor = dict.get(gpu::kProcessorEntryName) - .cast() - .getValue() - .getSExtValue(); - AffineMap map = - dict.get(gpu::kIndexMapEntryName).cast().getValue(); - AffineMapAttr boundAttr = - dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null(); - AffineMap bound; - if (boundAttr) - bound = boundAttr.getValue(); - return {processor, map, bound}; -} - /// Tries to derive a static upper bound from the defining operation of /// `upperBound`. 
static Value deriveStaticUpperBound(Value upperBound, @@ -546,6 +519,30 @@ return {}; } +static bool isMappedToProcessor(gpu::Processor processor) { + return processor != gpu::Processor::SEQUENTIAL; +} + +static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { + switch (processor) { + case gpu::Processor::BLOCKX: + return 0; + case gpu::Processor::BLOCKY: + return 1; + case gpu::Processor::BLOCKZ: + return 2; + case gpu::Processor::THREADX: + return 3; + case gpu::Processor::THREADY: + return 4; + case gpu::Processor::THREADZ: + return 5; + default:; + } + llvm_unreachable( + "invalid processor type while retrieving launch op argument number"); +} + /// Modifies the current transformation state to capture the effect of the given /// `loop.parallel` operation on index substitutions and the operations to be /// inserted. @@ -568,16 +565,14 @@ /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the /// worklist. This signals the processor of the worklist to pop the rewriter /// one scope-level up. -static LogicalResult processParallelLoop(ParallelOp parallelOp, - gpu::LaunchOp launchOp, - BlockAndValueMapping &cloningMap, - SmallVectorImpl &worklist, - DenseMap &bounds, - PatternRewriter &rewriter) { +static LogicalResult processParallelLoop( + ParallelOp parallelOp, gpu::LaunchOp launchOp, + BlockAndValueMapping &cloningMap, SmallVectorImpl &worklist, + DenseMap &bounds, PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential ArrayAttr mapping = - parallelOp.getAttrOfType(gpu::kMappingAttributeName); + parallelOp.getAttrOfType(gpu::getMappingAttrName()); // TODO(herhut): Support reductions. 
if (!mapping || parallelOp.getNumResults() != 0) @@ -604,12 +599,17 @@ Attribute mappingAttribute; Value iv, lowerBound, upperBound, step; std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; - MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute); + auto annotation = mappingAttribute.dyn_cast(); + if (!annotation) + return parallelOp.emitOpError() + << "expected mapping attribute for lowering to GPU"; Value newIndex; + gpu::Processor processor = gpu::getProcessor(annotation); - if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) { + if (isMappedToProcessor(processor)) { // Use the corresponding thread/grid index as replacement for the loop iv. - Value operand = launchOp.body().front().getArgument(annotation.processor); + Value operand = launchOp.body().front().getArgument( + getLaunchOpArgumentNum(processor)); // Take the indexmap and add the lower bound and step computations in. // This computes operand * step + lowerBound. // Use an affine map here so that it composes nicely with the provided @@ -619,11 +619,11 @@ rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + rewriter.getAffineSymbolExpr(1)); newIndex = rewriter.create( - loc, annotation.indexMap.compose(lowerAndStep), + loc, annotation.map().getValue().compose(lowerAndStep), ValueRange{operand, step, lowerBound}); // If there was also a bound, insert that, too. // TODO(herhut): Check that we do not assign bounds twice. - if (annotation.boundMap) { + if (annotation.bound().getValue()) { // We pass as the single opererand to the bound-map the number of // iterations, which is (upperBound - lowerBound) ceilDiv step. To // support inner loops with dynamic upper bounds (as generated by e.g. 
@@ -663,19 +663,19 @@ rewriter.getAffineSymbolExpr(1)) .ceilDiv(rewriter.getAffineSymbolExpr(2)))); Value launchBound = rewriter.create( - loc, annotation.boundMap.compose(stepMap), + loc, annotation.bound().getValue().compose(stepMap), ValueRange{ ensureLaunchIndependent( cloningMap.lookupOrDefault(upperBound)), ensureLaunchIndependent( cloningMap.lookupOrDefault(lowerBound)), ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); - if (bounds.find(annotation.processor) != bounds.end()) { + if (bounds.find(processor) != bounds.end()) { return parallelOp.emitOpError() << "cannot redefine the bound for processor " - << annotation.processor; + << stringifyProcessor(processor); } - bounds[annotation.processor] = launchBound; + bounds[processor] = launchBound; } if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. @@ -757,7 +757,7 @@ rewriter.setInsertionPointToStart(&launchOp.body().front()); BlockAndValueMapping cloningMap; - llvm::DenseMap launchBounds; + llvm::DenseMap launchBounds; SmallVector worklist; if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, launchBounds, rewriter))) @@ -809,7 +809,8 @@ // Now that we succeeded creating the launch operation, also update the // bounds. 
for (auto bound : launchBounds) - launchOp.setOperand(std::get<0>(bound), std::get<1>(bound)); + launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), + std::get<1>(bound)); rewriter.eraseOp(parallelOp); return matchSuccess(); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -10,6 +10,8 @@ DEPENDS MLIRGPUOpsIncGen + MLIRParallelLoopMapperAttrGen + MLIRParallelLoopMapperEnumsGen ) target_link_libraries(MLIRGPU PUBLIC diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp --- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -23,6 +23,43 @@ using namespace mlir::gpu; using namespace mlir::loop; +#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc" +namespace mlir { + +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc" +namespace gpu { + +StringRef getMappingAttrName() { return "mapping"; } + +ParallelLoopDimMapper getParallelLoopDimMapperAttr(Processor processor, + AffineMap map, + AffineMap bound) { + MLIRContext *context = map.getContext(); + OpBuilder builder(context); + return ParallelLoopDimMapper::get( + builder.getI64IntegerAttr(static_cast<int64_t>(processor)), + AffineMapAttr::get(map), AffineMapAttr::get(bound), context); +} + +LogicalResult setMappingAttr(Operation *op, + ArrayRef<ParallelLoopDimMapper> mapping) { + // Verify that each processor is mapped to only once (SEQUENTIAL may repeat). 
+ llvm::DenseSet<gpu::Processor> specifiedMappings; + for (auto dimAttr : mapping) { + gpu::Processor processor = + static_cast<gpu::Processor>(dimAttr.processor().getInt()); + if (processor != gpu::Processor::SEQUENTIAL && + !specifiedMappings.insert(processor).second) + return op->emitError("invalid mapping multiple loops to same processor"); + } + ArrayRef<Attribute> mappingAsAttrs(mapping.data(), mapping.size()); + op->setAttr(getMappingAttrName(), + ArrayAttr::get(mappingAsAttrs, op->getContext())); + return success(); +} +} // namespace gpu +} // namespace mlir + namespace { enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; @@ -43,10 +80,41 @@ /// Computed the hardware id to use for a given mapping level. Will /// assign x,y and z hardware ids for the first 3 dimensions and use /// sequential after. -static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { +/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is +/// distributed to map to x, the next innermost to y and the next innermost to +/// z. +static gpu::Processor getHardwareIdForMapping(MappingLevel level, + int dimension) { + if (dimension >= kNumHardwareIds || level == Sequential) - return Sequential * kNumHardwareIds; - return (level * kNumHardwareIds) + dimension; + return Processor::SEQUENTIAL; + switch (level) { + case MapGrid: + switch (dimension) { + case 0: + return Processor::BLOCKX; + case 1: + return Processor::BLOCKY; + case 2: + return Processor::BLOCKZ; + default: + return Processor::SEQUENTIAL; + } + break; + case MapBlock: + switch (dimension) { + case 0: + return Processor::THREADX; + case 1: + return Processor::THREADY; + case 2: + return Processor::THREADZ; + default: + return Processor::SEQUENTIAL; + } + default:; + } + return Processor::SEQUENTIAL; } /// Add mapping information to the given parallel loop. 
Do not add @@ -55,26 +123,20 @@ static void mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid) { // Do not try to add a mapping to already mapped loops or nested loops. - if (parallelOp.getAttr(gpu::kMappingAttributeName) || + if (parallelOp.getAttr(getMappingAttrName()) || ((mappingLevel == MapGrid) && parallelOp.getParentOfType())) return; MLIRContext *ctx = parallelOp.getContext(); Builder b(ctx); - SmallVector attrs; + SmallVector attrs; attrs.reserve(parallelOp.getNumInductionVars()); for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) { - SmallVector entries; - entries.emplace_back(b.getNamedAttr( - kProcessorEntryName, - b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)))); - entries.emplace_back(b.getNamedAttr( - kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); - entries.emplace_back(b.getNamedAttr( - kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); - attrs.push_back(DictionaryAttr::get(entries, ctx)); + attrs.push_back(getParallelLoopDimMapperAttr( + getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(), + b.getDimIdentityMap())); } - parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx)); + setMappingAttr(parallelOp, attrs); ++mappingLevel; // Parallel loop operations are immediately nested, so do not use // walk but just iterate over the operations. 
diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir --- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -3,7 +3,7 @@ // 2-d parallel loop mapped to block.y and block.x func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : index, %arg4 : index, + %arg3 : index, %arg4 : index, %buf : memref, %res : memref) { %step = constant 2 : index @@ -309,7 +309,7 @@ %buf : memref, %res : memref) { %four = constant 4 : index - // expected-error@+2 {{cannot redefine the bound for processor 1}} + // expected-error@+2 {{cannot redefine the bound for processor BLOCKY}} // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%four, %four) { @@ -334,7 +334,7 @@ // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%four, %four) { - // expected-error@+1 {{cannot derive loop-invariant upper bound}} + // expected-error@+1 {{cannot derive loop-invariant upper bound}} loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1) step (%one, %one) { %idx0 = addi %i0, %si0 : index