diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h @@ -0,0 +1,50 @@ +//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file declares the utilities to generate mappings for parallel +// loops to GPU devices. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H +#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H + +namespace mlir { + +struct Region; + +namespace gpu { + +/// Name of the mapping attribute produced by loop mappers. +static constexpr const char *kMappingAttributeName = "mapping"; +/// Name of the processor sub-attribute that identifies the hardware id +/// to map a loop to. +static constexpr const char *kProcessorEntryName = "processor"; +/// Name of the map sub-attribute that identifies the affine map to apply +/// to the hardware id to compute the iteration number of the loop. This +/// map is expected to be extended by step and lower bound computations: +/// index = map(hardware_id) * step + lowerbound +static constexpr const char *kIndexMapEntryName = "map"; +/// Name of the bound sub-attribute that identifies the affine map to +/// compute an upper bound of iterations for the hardware id. This is +/// applied to an upper bound on the number of iterations: +/// launchBound = bound(upperbound-lowerbound ceildiv step) +static constexpr const char *kBoundMapEntryName = "bound"; + +} // end namespace gpu + +/// Maps the parallel loops found in the given function to workgroups. 
The first +/// loop encountered will be mapped to the global workgroup and the second loop +/// encountered to the local workgroup. Within each mapping, the first three +/// dimensions are mapped to x/y/z hardware ids and all following dimensions are +/// mapped to sequential loops. +void greedilyMapParallelLoopsToGPU(Region &region); + +} // end namespace mlir + +#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td --- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td +++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td @@ -289,6 +289,9 @@ let extraClassDeclaration = [{ Block *getBody() { return &region().front(); } + unsigned getNumInductionVars() { + return getBody()->getNumArguments(); + } iterator_range<Block::args_iterator> getInductionVars() { return {getBody()->args_begin(), getBody()->args_end()}; } diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -17,6 +17,7 @@ #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Dialect/AffineOps/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExpr.h" @@ -508,23 +509,20 @@ } // namespace -static constexpr const char *kProcessorEntryName = "processor"; -static constexpr const char *kIndexMapEntryName = "map"; -static constexpr const char *kBoundMapEntryName = "bound"; - /// Extracts the mapping annotations from the provided attribute. The attribute /// is expected to be of the form /// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> } /// where the bound is optional. 
static MappingAnnotation extractMappingAnnotation(Attribute attribute) { DictionaryAttr dict = attribute.cast<DictionaryAttr>(); - unsigned processor = dict.get(kProcessorEntryName) + unsigned processor = dict.get(gpu::kProcessorEntryName) .cast<IntegerAttr>() .getValue() .getSExtValue(); - AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue(); + AffineMap map = + dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue(); AffineMapAttr boundAttr = - dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>(); + dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>(); AffineMap bound; if (boundAttr) bound = boundAttr.getValue(); @@ -583,7 +581,8 @@ PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential - ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping"); + ArrayAttr mapping = + parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName); // TODO(herhut): Support reductions. if (!mapping || parallelOp.getNumResults() != 0) diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -3,6 +3,7 @@ Transforms/AllReduceLowering.cpp Transforms/KernelOutlining.cpp Transforms/MemoryPromotion.cpp + Transforms/ParallelLoopMapper.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -0,0 +1,89 @@ +//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities to generate mappings for parallel loops to +// GPU devices. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" + +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/Passes.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::gpu; +using namespace mlir::loop; + +namespace { + +enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; + +static constexpr int kNumHardwareIds = 3; + +} // namespace + +/// Bounded increment on MappingLevel. Increments to the next +/// level unless Sequential was already reached. +MappingLevel &operator++(MappingLevel &mappingLevel) { + if (mappingLevel < Sequential) { + mappingLevel = static_cast<MappingLevel>(mappingLevel + 1); + } + return mappingLevel; +} + +/// Computes the hardware id to use for a given mapping level. Will +/// assign x,y and z hardware ids for the first 3 dimensions and use +/// sequential after. +static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { + if (dimension >= kNumHardwareIds || level == Sequential) + return Sequential * kNumHardwareIds; + return (level * kNumHardwareIds) + dimension; +} + +/// Add mapping information to the given parallel loop. Do not add +/// mapping information if the loop already has it. Also, don't +/// start a mapping at a nested loop. +static void mapParallelOp(ParallelOp parallelOp, + MappingLevel mappingLevel = MapGrid) { + // Do not try to add a mapping to already mapped loops or nested loops. 
+ if (parallelOp.getAttr(gpu::kMappingAttributeName) || + ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>())) + return; + + MLIRContext *ctx = parallelOp.getContext(); + Builder b(ctx); + SmallVector<Attribute, 4> attrs; + attrs.reserve(parallelOp.getNumInductionVars()); + for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) { + SmallVector<NamedAttribute, 3> entries; + entries.emplace_back(b.getNamedAttr( + kProcessorEntryName, + b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)))); + entries.emplace_back(b.getNamedAttr( + kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); + entries.emplace_back(b.getNamedAttr( + kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); + attrs.push_back(DictionaryAttr::get(entries, ctx)); + } + parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx)); + ++mappingLevel; + // Parallel loop operations are immediately nested, so do not use + // walk but just iterate over the operations. + for (Operation &op : *parallelOp.getBody()) { + if (ParallelOp nested = dyn_cast<ParallelOp>(op)) + mapParallelOp(nested, mappingLevel); + } +} + +void mlir::greedilyMapParallelLoopsToGPU(Region &region) { + region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); }); +} diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/mapping.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four) + step (%one, %one) { + } + } + return +} + +// CHECK-LABEL: func @parallel_loop( +// CHECK: loop.parallel +// CHECK: loop.parallel +// CHECK: {mapping = [{bound = 
affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]} +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]} +// CHECK-NOT: mapping + +// ----- + +func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3) + step (%four, %four, %four, %four) { + loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four) + step (%one, %one, %one, %one) { + loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four) + step (%one, %one, %one, %one) { + } + } + } + return +} + +// CHECK-LABEL: func @parallel_loop_4d( +// CHECK: loop.parallel +// CHECK: loop.parallel +// CHECK: loop.parallel +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]} +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> 
(d0)>, processor = 6 : i64}]} +// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64}, +// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]} +// CHECK-NOT: mapping diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ TestConstantFold.cpp TestLoopFusion.cpp TestGpuMemoryPromotion.cpp + TestGpuParallelLoopMapping.cpp TestInlining.cpp TestLinalgTransforms.cpp TestLiveness.cpp diff --git a/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp @@ -0,0 +1,38 @@ +//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass testing the utilities for mapping parallel +// loops to gpu hardware ids. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +/// Simple pass for testing the mapping of parallel loops to hardware ids using +/// a greedy mapping strategy. 
+class TestGpuGreedyParallelLoopMappingPass + : public OperationPass<TestGpuGreedyParallelLoopMappingPass> { + void runOnOperation() override { + Operation *op = getOperation(); + for (Region &region : op->getRegions()) + greedilyMapParallelLoopsToGPU(region); + } +}; +} // end namespace + +namespace mlir { +void registerTestGpuParallelLoopMappingPass() { + PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration( + "test-gpu-greedy-parallel-loop-mapping", + "Greedily maps all parallel loops to gpu hardware ids."); +} +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -50,6 +50,7 @@ void registerTestMemRefStrideCalculation(); void registerTestOpaqueLoc(); void registerTestParallelismDetection(); +void registerTestGpuParallelLoopMappingPass(); void registerTestVectorConversions(); void registerTestVectorToLoopsPass(); void registerVectorizerTestPass(); @@ -103,6 +104,7 @@ registerTestMemRefStrideCalculation(); registerTestOpaqueLoc(); registerTestParallelismDetection(); + registerTestGpuParallelLoopMappingPass(); registerTestVectorConversions(); registerTestVectorToLoopsPass(); registerVectorizerTestPass();