diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
@@ -0,0 +1,30 @@
+//===- ParallelLoopMapper.h - Parallel loop to GPU mapping -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utilities to generate mappings for parallel
+// loops to GPU devices.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
+
+namespace mlir {
+
+class FuncOp;
+
+/// Maps the parallel loops found in the given function to workgroups. The first
+/// loop encountered will be mapped to the global workgroup and the second loop
+/// encountered to the local workgroup. Within each mapping, the first three
+/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
+/// mapped to sequential loops.
+void greedilyMapParallelLoopsToGPU(FuncOp op);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
--- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
+++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
@@ -190,6 +190,9 @@
   let extraClassDeclaration = [{
     Block *getBody() { return &region().front(); }
+    unsigned getNumInductionVars() {
+      return getBody()->getNumArguments();
+    }
     iterator_range<Block::args_iterator> getInductionVars() {
       return {getBody()->args_begin(), getBody()->args_end()};
     }
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -3,6 +3,7 @@
   Transforms/AllReduceLowering.cpp
   Transforms/KernelOutlining.cpp
   Transforms/MemoryPromotion.cpp
+  Transforms/ParallelLoopMapper.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -0,0 +1,88 @@
+//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities to generate mappings for parallel loops to
+// GPU devices.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+using namespace mlir::loop;
+
+namespace {
+
+enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
+
+static constexpr int kNumHardwareIds = 3;
+
+/// Bounded increment on MappingLevel. Increments to the next
+/// level unless Sequential was already reached.
+MappingLevel &operator++(MappingLevel &mappingLevel) {
+  if (mappingLevel < Sequential) {
+    mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
+  }
+  return mappingLevel;
+}
+
+/// Computes the hardware id to use for a given mapping level. Will
+/// assign x,y and z hardware ids for the first 3 dimensions and use
+/// sequential after.
+int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
+  if (dimension >= kNumHardwareIds || level == Sequential)
+    return Sequential * kNumHardwareIds;
+  return (level * kNumHardwareIds) + dimension;
+}
+
+/// Add mapping information to the given parallel loop. Do not add
+/// mapping information if the loop already has it. Also, don't
+/// start a mapping at a nested loop.
+void mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid) {
+  // Do not try to add a mapping to already mapped loops or nested loops.
+  if (parallelOp.getAttr("mapping") ||
+      ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
+    return;
+
+  MLIRContext *ctx = parallelOp.getContext();
+  Builder b(ctx);
+  SmallVector<Attribute, 4> attrs;
+  attrs.reserve(parallelOp.getNumInductionVars());
+  for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
+    SmallVector<NamedAttribute, 3> entries;
+    entries.emplace_back(
+        Identifier::get("processor", ctx),
+        b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)));
+    entries.emplace_back(Identifier::get("map", ctx),
+                         AffineMapAttr::get(b.getDimIdentityMap()));
+    entries.emplace_back(Identifier::get("bound", ctx),
+                         AffineMapAttr::get(b.getDimIdentityMap()));
+    attrs.push_back(DictionaryAttr::get(entries, ctx));
+  }
+  parallelOp.setAttr("mapping", ArrayAttr::get(attrs, ctx));
+  ++mappingLevel;
+  // Parallel loop operations are immediately nested, so do not use
+  // walk but just iterate over the operations.
+  for (Operation &op : *parallelOp.getBody()) {
+    if (ParallelOp nested = dyn_cast<ParallelOp>(op))
+      mapParallelOp(nested, mappingLevel);
+  }
+}
+
+} // namespace
+
+void mlir::greedilyMapParallelLoopsToGPU(FuncOp func) {
+  func.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
+}
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -0,0 +1,61 @@
+// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                step (%four, %four) {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
+                  step (%one, %one) {
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
+// CHECK-NOT: mapping
+
+// -----
+
+func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
+                       %arg3 : index) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero) to (%arg0, %arg1, %arg2, %arg3)
+                step (%four, %four, %four, %four) {
+    loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
+                  step (%one, %one, %one, %one) {
+      loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero) to (%four, %four, %four, %four)
+                    step (%one, %one, %one, %one) {
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @parallel_loop_4d(
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: loop.parallel
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64},
+// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
+// CHECK-NOT: mapping
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@
   TestConstantFold.cpp
   TestLoopFusion.cpp
   TestGpuMemoryPromotion.cpp
+  TestGpuParallelLoopMapping.cpp
   TestInlining.cpp
   TestLinalgTransforms.cpp
   TestLiveness.cpp
diff --git a/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuParallelLoopMapping.cpp
@@ -0,0 +1,36 @@
+//===- TestGpuParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for mapping parallel
+// loops to gpu hardware ids.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Simple pass for testing the mapping of parallel loops to hardware ids using
+/// a greedy mapping strategy.
+class TestGpuGreedyParallelLoopMappingPass
+    : public OperationPass<TestGpuGreedyParallelLoopMappingPass, FuncOp> {
+  void runOnOperation() override {
+    greedilyMapParallelLoopsToGPU(getOperation());
+  }
+};
+} // end namespace
+
+namespace mlir {
+void registerTestGpuParallelLoopMappingPass() {
+  PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration(
+      "test-gpu-greedy-parallel-loop-mapping",
+      "Greedily maps all parallel loops to gpu hardware ids.");
+}
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -49,6 +49,7 @@
+void registerTestGpuParallelLoopMappingPass();
 void registerTestMemRefStrideCalculation();
 void registerTestOpaqueLoc();
 void registerTestParallelismDetection();
 void registerTestVectorConversions();
 void registerTestVectorToLoopsPass();
 void registerVectorizerTestPass();
@@ -101,6 +102,7 @@
+  registerTestGpuParallelLoopMappingPass();
   registerTestMemRefStrideCalculation();
   registerTestOpaqueLoc();
   registerTestParallelismDetection();
   registerTestVectorConversions();
   registerTestVectorToLoopsPass();
   registerVectorizerTestPass();