diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -4,295 +4,46 @@ [TOC] -## Affine control lowering (`-lower-affine`) +## General Transformation Passes -Convert operations related to affine control into a graph of blocks using -operations from the standard dialect. +[include "GeneralPasses.md"] -Loop statements are converted to a subgraph of blocks (initialization, condition -checking, subgraph of body blocks) with loop induction variable being passed as -the block argument of the condition checking block. Conditional statements are -converted to a subgraph of blocks (chain of condition checking with -short-circuit logic, subgraphs of 'then' and 'else' body blocks). `affine.apply` -operations are converted into sequences of primitive arithmetic operations that -have the same effect, using operands of the `index` type. Consequently, named -maps and sets may be removed from the module. +## Conversion Passes -For example, `%r = affine.apply (d0, d1)[s0] -> (d0 + 2*d1 + s0)(%d0, %d1)[%s0]` -can be converted into: +[include "ConversionPasses.md"] -```mlir -%d0 = <...> -%d1 = <...> -%s0 = <...> -%0 = constant 2 : index -%1 = muli %0, %d1 -%2 = addi %d0, %1 -%r = addi %2, %s0 -``` +## Quantizer Passes -### Input invariant +[include "QuantizerPasses.md"] -- no `Tensor` types; +## `affine` Dialect Passes -These restrictions may be lifted in the future. +[include "AffinePasses.md"] -### Output IR +## `fxpmath` Dialect Passes -Functions with `affine.for` and `affine.if` operations eliminated. These -functions may contain operations from the Standard dialect in addition to those -already present before the pass. +[include "FxpMathPasses.md"] -### Invariants +## `gpu` Dialect Passes -- Functions without a body are not modified. -- The semantics of the other functions is preserved. -- Individual operations other than those mentioned above are not modified if - they do not depend on the loop iterator value or on the result of - `affine.apply`. +[include "GPUPasses.md"] -## Conversion from Standard to LLVM IR dialect (`-convert-std-to-llvm`) +## `linalg` Dialect Passes -Convert standard operations into the LLVM IR dialect operations. +[include "LinalgPasses.md"] -### Input invariant +## `llvm` Dialect Passes -- operations including: arithmetic on integers and floats, constants, direct - calls, returns and branches; -- no `tensor` types; -- all `vector` are one-dimensional; -- all blocks are reachable by following the successors of the first basic - block; +[include "LLVMPasses.md"] -If other operations are present and their results are required by the LLVM IR -dialect operations, the pass will fail. Any LLVM IR operations or types already -present in the IR will be kept as is. +## `loop` Dialect Passes -### Output IR +[include "LoopPasses.md"] -Functions converted to LLVM IR. Function arguments types are converted -one-to-one. Function results are converted one-to-one and, in case more than 1 -value is returned, packed into an LLVM IR struct type. Function calls and -returns are updated accordingly. Block argument types are updated to use LLVM IR -types. +## `quant` Dialect Passes -## Data Copy DMA generation (`-affine-data-copy-generate`) +[include "QuantPasses.md"] -Replaces all loads and stores on memref's living in 'slowMemorySpace' by -introducing DMA operations (strided DMA if necessary) to transfer data to/from -`fastMemorySpace` and rewriting the original load's/store's to instead -load/store from the allocated fast memory buffers. 
Additional options specify -the identifier corresponding to the fast memory space and the amount of fast -memory space available. The pass traverses through the nesting structure, -recursing to inner levels if necessary to determine at what depth DMA transfers -need to be placed so that the allocated buffers fit within the memory capacity -provided. If this is not possible (for example, when the elemental type itself -is of size larger than the DMA capacity), an error with location information is -emitted. The DMA transfers are also hoisted up past all loops with respect to -which the transfers are invariant. +## `spv` Dialect Passes -Input - -```mlir -func @loop_nest_tiled() -> memref<256x1024xf32> { - %0 = alloc() : memref<256x1024xf32> - affine.for %i0 = 0 to 256 step 32 { - affine.for %i1 = 0 to 1024 step 32 { - affine.for %i2 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 32)(%i0) { - affine.for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) { - %1 = affine.load %0[%i2, %i3] : memref<256x1024xf32> - } - } - } - } - return %0 : memref<256x1024xf32> -} -``` - -Output (with flags: -affine-data-copy-generate -affine-data-copy-generate-fast-mem-space=2) - -```mlir -module { - func @loop_nest_tiled() -> memref<256x1024xf32> { - %c262144 = constant 262144 : index - %c0 = constant 0 : index - %0 = alloc() : memref<256x1024xf32> - %1 = alloc() : memref<256x1024xf32, 2> - %2 = alloc() : memref<1xi32> - affine.dma_start %0[%c0, %c0], %1[%c0, %c0], %2[%c0], %c262144 : memref<256x1024xf32>, memref<256x1024xf32, 2>, memref<1xi32> - affine.dma_wait %2[%c0], %c262144 : memref<1xi32> - affine.for %arg0 = 0 to 256 step 32 { - affine.for %arg1 = 0 to 1024 step 32 { - affine.for %arg2 = #map1(%arg0) to #map2(%arg0) { - affine.for %arg3 = #map1(%arg1) to #map2(%arg1) { - %3 = affine.load %1[%arg2, %arg3] : memref<256x1024xf32, 2> - } - } - } - } - dealloc %2 : memref<1xi32> - dealloc %1 : memref<256x1024xf32, 2> - return %0 : memref<256x1024xf32> - } -} -``` - -## Loop tiling (`-affine-loop-tile`) - -Performs tiling or blocking of loop nests. It currently works on perfect loop -nests. - -## Loop unroll (`-affine-loop-unroll`) - -This pass implements loop unrolling. It is able to unroll loops with arbitrary -bounds, and generate a cleanup loop when necessary. - -## Loop unroll and jam (`-affine-loop-unroll-jam`) - -This pass implements unroll and jam for loops. It works on both perfect or -imperfect loop nests. - -## Loop fusion (`-affine-loop-fusion`) - -Performs fusion of loop nests using a slicing-based approach. The fused loop -nests, when possible, are rewritten to access significantly smaller local -buffers instead of the original memref's, and the latter are often -either completely optimized away or contracted. This transformation leads to -enhanced locality and lower memory footprint through the elimination or -contraction of temporaries / intermediate memref's. These benefits are sometimes -achieved at the expense of redundant computation through a cost model that -evaluates available choices such as the depth at which a source slice should be -materialized in the designation slice. - -## Memref bound checking (`-memref-bound-check`) - -Checks all load's and store's on memref's for out of bound accesses, and reports -any out of bound accesses (both overrun and underrun) with location information. 
- -```mlir -test/Transforms/memref-bound-check.mlir:19:13: error: 'load' op memref out of upper bound access along dimension #2 - %x = load %A[%idx0, %idx1] : memref<9 x 9 x i32> - ^ -test/Transforms/memref-bound-check.mlir:19:13: error: 'load' op memref out of lower bound access along dimension #2 - %x = load %A[%idx0, %idx1] : memref<9 x 9 x i32> - ^ -``` - -## Memref dataflow optimization (`-memref-dataflow-opt`) - -This pass performs store to load forwarding for memref's to eliminate memory -accesses and potentially the entire memref if all its accesses are forwarded. - -Input - -```mlir -func @store_load_affine_apply() -> memref<10x10xf32> { - %cf7 = constant 7.0 : f32 - %m = alloc() : memref<10x10xf32> - affine.for %i0 = 0 to 10 { - affine.for %i1 = 0 to 10 { - affine.store %cf7, %m[%i0, %i1] : memref<10x10xf32> - %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32> - %v1 = addf %v0, %v0 : f32 - } - } - return %m : memref<10x10xf32> -} -``` - -Output - -```mlir -module { - func @store_load_affine_apply() -> memref<10x10xf32> { - %cst = constant 7.000000e+00 : f32 - %0 = alloc() : memref<10x10xf32> - affine.for %arg0 = 0 to 10 { - affine.for %arg1 = 0 to 10 { - affine.store %cst, %0[%arg0, %arg1] : memref<10x10xf32> - %1 = addf %cst, %cst : f32 - } - } - return %0 : memref<10x10xf32> - } -} - -``` - -## Memref dependence analysis (`-memref-dependence-check`) - -This pass performs dependence analysis to determine dependences between pairs of -memory operations (load's and store's) on memref's. Dependence analysis exploits -polyhedral information available (affine maps, expressions, and affine.apply -operations) to precisely represent dependences using affine constraints, while -also computing dependence vectors from them, where each component of the -dependence vector provides a lower and an upper bound on the dependence distance -along the corresponding dimension. - -```mlir -test/Transforms/memref-dataflow-opt.mlir:232:7: note: dependence from 2 to 1 at depth 1 = ([1, 1], [-inf, +inf]) - store %cf9, %m[%idx] : memref<10xf32> -``` - -## Pipeline data transfer (`-affine-pipeline-data-transfer`) - -This pass performs a transformation to overlap non-blocking DMA operations in a -loop with computations through double buffering. This is achieved by advancing -dma_start operations with respect to other operations. 
- -Input - -```mlir -func @pipelinedatatransfer() { - %0 = alloc() : memref<256xf32> - %1 = alloc() : memref<32xf32, 1> - %2 = alloc() : memref<1xf32> - %c0 = constant 0 : index - %c128 = constant 128 : index - affine.for %i0 = 0 to 8 { - affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32> - affine.dma_wait %2[%c0], %c128 : memref<1xf32> - %3 = affine.load %1[%i0] : memref<32xf32, 1> - %4 = "compute"(%3) : (f32) -> f32 - affine.store %4, %1[%i0] : memref<32xf32, 1> - } - return -} -``` - -Output - -```mlir -module { - func @pipelinedatatransfer() { - %c8 = constant 8 : index - %c0 = constant 0 : index - %0 = alloc() : memref<256xf32> - %c0_0 = constant 0 : index - %c128 = constant 128 : index - %1 = alloc() : memref<2x32xf32, 1> - %2 = alloc() : memref<2x1xf32> - affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> - affine.for %arg0 = 1 to 8 { - affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> - %8 = affine.apply #map3(%arg0) - %9 = affine.apply #map4(%8) - %10 = affine.apply #map4(%8) - affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> - %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1> - %12 = "compute"(%11) : (f32) -> f32 - affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1> - } - %3 = affine.apply #map3(%c8) - %4 = affine.apply #map4(%3) - %5 = affine.apply #map4(%3) - affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> - %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1> - %7 = "compute"(%6) : (f32) -> f32 - affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1> - dealloc %2 : memref<2x1xf32> - dealloc %1 : memref<2x32xf32, 1> - return - } -} -``` +[include "SPIRVPasses.md"] diff --git a/mlir/include/mlir/Conversion/CMakeLists.txt b/mlir/include/mlir/Conversion/CMakeLists.txt --- a/mlir/include/mlir/Conversion/CMakeLists.txt +++ b/mlir/include/mlir/Conversion/CMakeLists.txt @@ -2,3 +2,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRConversionPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc ConversionPasses ./) diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -18,6 +18,54 @@ def ConvertAffineToStandard : Pass<"lower-affine"> { let summary = "Lower Affine operations to a combination of Standard and Loop " "operations"; + let description = [{ + Convert operations related to affine control into a graph of blocks using + operations from the standard dialect. + + Loop statements are converted to a subgraph of blocks (initialization, + condition checking, subgraph of body blocks) with loop induction variable + being passed as the block argument of the condition checking block. + Conditional statements are converted to a subgraph of blocks (chain of + condition checking with short-circuit logic, subgraphs of 'then' and 'else' + body blocks). `affine.apply` operations are converted into sequences of + primitive arithmetic operations that have the same effect, using operands of + the `index` type. Consequently, named maps and sets may be removed from the + module. 
+
+    For example,
+    `%r = affine.apply (d0, d1)[s0] -> (d0 + 2*d1 + s0)(%d0, %d1)[%s0]`
+    can be converted into:
+
+    ```mlir
+    %d0 = <...>
+    %d1 = <...>
+    %s0 = <...>
+    %0 = constant 2 : index
+    %1 = muli %0, %d1
+    %2 = addi %d0, %1
+    %r = addi %2, %s0
+    ```
+
+    #### Input invariant
+
+    - no `Tensor` types;
+
+    These restrictions may be lifted in the future.
+
+    #### Output IR
+
+    Functions with `affine.for` and `affine.if` operations eliminated. These
+    functions may contain operations from the Standard dialect in addition to
+    those already present before the pass.
+
+    #### Invariants
+
+    - Functions without a body are not modified.
+    - The semantics of the other functions is preserved.
+    - Individual operations other than those mentioned above are not modified
+      if they do not depend on the loop iterator value or on the result of
+      `affine.apply`.
+  }];
  let constructor = "mlir::createLowerAffinePass()";
}
@@ -152,6 +200,30 @@
def ConvertStandardToLLVM : Pass<"convert-std-to-llvm"> {
  let summary = "Convert scalar and vector operations from the Standard to the "
                "LLVM dialect";
+  let description = [{
+    Convert standard operations into the LLVM IR dialect operations.
+
+    #### Input invariant
+
+    - operations including: arithmetic on integers and floats, constants,
+      direct calls, returns and branches;
+    - no `tensor` types;
+    - all `vector` types are one-dimensional;
+    - all blocks are reachable by following the successors of the first basic
+      block;
+
+    If other operations are present and their results are required by the LLVM
+    IR dialect operations, the pass will fail. Any LLVM IR operations or types
+    already present in the IR will be kept as is.
+
+    #### Output IR
+
+    Functions converted to LLVM IR. Function argument types are converted
+    one-to-one. Function results are converted one-to-one and, when more than
+    one value is returned, packed into an LLVM IR struct type. Function calls
+    and returns are updated accordingly. Block argument types are updated to
+    use LLVM IR types.
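+
+    For illustration, a function returning two values could be rewritten along
+    the following lines (a hand-written sketch of the struct-packing scheme,
+    not verbatim pass output; the exact LLVM dialect type syntax and value
+    names may differ):
+
+    ```mlir
+    // Before: a standard function with two results.
+    func @two_results(%arg0: f32) -> (f32, f32) {
+      return %arg0, %arg0 : f32, f32
+    }
+
+    // After: the two results are packed into a single LLVM IR struct value.
+    llvm.func @two_results(%arg0: !llvm.float) -> !llvm<"{ float, float }"> {
+      %0 = llvm.mlir.undef : !llvm<"{ float, float }">
+      %1 = llvm.insertvalue %arg0, %0[0] : !llvm<"{ float, float }">
+      %2 = llvm.insertvalue %arg0, %1[1] : !llvm<"{ float, float }">
+      llvm.return %2 : !llvm<"{ float, float }">
+    }
+    ```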
+ }]; let constructor = "mlir::createLowerToLLVMPass()"; let options = [ Option<"useAlloca", "use-alloca", "bool", /*default=*/"false", diff --git a/mlir/include/mlir/Dialect/Affine/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt --- a/mlir/include/mlir/Dialect/Affine/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt @@ -3,3 +3,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRAffinePassIncGen) + +add_mlir_doc(Passes -gen-pass-doc AffinePasses ./) diff --git a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt --- a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt @@ -4,3 +4,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRFxpMathPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc FxpMathPasses ./) diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -4,3 +4,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRGPUPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc GPUPasses ./) diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt --- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt @@ -29,7 +29,8 @@ mlir_tablegen(LLVMAVX512Conversions.inc -gen-llvmir-conversions) add_public_tablegen_target(MLIRLLVMAVX512ConversionsIncGen) - set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRLLVMPassIncGen) + +add_mlir_doc(Transforms/Passes -gen-pass-doc LLVMPasses ./) diff --git a/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt --- a/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/CMakeLists.txt @@ -4,3 +4,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRLinalgPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc LinalgPasses ./) diff --git a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt --- a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt @@ -4,3 +4,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRLoopPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc LoopPasses ./) diff --git a/mlir/include/mlir/Dialect/Quant/CMakeLists.txt b/mlir/include/mlir/Dialect/Quant/CMakeLists.txt --- a/mlir/include/mlir/Dialect/Quant/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Quant/CMakeLists.txt @@ -4,3 +4,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRQuantPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc QuantPasses ./) diff --git a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt --- a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt @@ -34,3 +34,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) 
add_public_tablegen_target(MLIRSPIRVPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc SPIRVPasses ./) diff --git a/mlir/include/mlir/Quantizer/Transforms/CMakeLists.txt b/mlir/include/mlir/Quantizer/Transforms/CMakeLists.txt --- a/mlir/include/mlir/Quantizer/Transforms/CMakeLists.txt +++ b/mlir/include/mlir/Quantizer/Transforms/CMakeLists.txt @@ -2,3 +2,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRQuantizerPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc QuantizerPasses ./) diff --git a/mlir/include/mlir/Transforms/CMakeLists.txt b/mlir/include/mlir/Transforms/CMakeLists.txt --- a/mlir/include/mlir/Transforms/CMakeLists.txt +++ b/mlir/include/mlir/Transforms/CMakeLists.txt @@ -2,3 +2,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(MLIRTransformsPassIncGen) + +add_mlir_doc(Passes -gen-pass-doc GeneralPasses ./) diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -19,6 +19,68 @@ "affine-pipeline-data-transfer"> { let summary = "Pipeline non-blocking data transfers between explicitly " "managed levels of the memory hierarchy"; + let description = [{ + This pass performs a transformation to overlap non-blocking DMA operations + in a loop with computations through double buffering. This is achieved by + advancing dma_start operations with respect to other operations. + + Input + + ```mlir + func @pipelinedatatransfer() { + %0 = alloc() : memref<256xf32> + %1 = alloc() : memref<32xf32, 1> + %2 = alloc() : memref<1xf32> + %c0 = constant 0 : index + %c128 = constant 128 : index + affine.for %i0 = 0 to 8 { + affine.dma_start %0[%i0], %1[%i0], %2[%c0], %c128 : memref<256xf32>, memref<32xf32, 1>, memref<1xf32> + affine.dma_wait %2[%c0], %c128 : memref<1xf32> + %3 = affine.load %1[%i0] : memref<32xf32, 1> + %4 = "compute"(%3) : (f32) -> f32 + affine.store %4, %1[%i0] : memref<32xf32, 1> + } + return + } + ``` + + Output + + ```mlir + module { + func @pipelinedatatransfer() { + %c8 = constant 8 : index + %c0 = constant 0 : index + %0 = alloc() : memref<256xf32> + %c0_0 = constant 0 : index + %c128 = constant 128 : index + %1 = alloc() : memref<2x32xf32, 1> + %2 = alloc() : memref<2x1xf32> + affine.dma_start %0[%c0], %1[%c0 mod 2, %c0], %2[%c0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> + affine.for %arg0 = 1 to 8 { + affine.dma_start %0[%arg0], %1[%arg0 mod 2, %arg0], %2[%arg0 mod 2, symbol(%c0_0)], %c128 : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32> + %8 = affine.apply #map3(%arg0) + %9 = affine.apply #map4(%8) + %10 = affine.apply #map4(%8) + affine.dma_wait %2[%8 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> + %11 = affine.load %1[%8 mod 2, %8] : memref<2x32xf32, 1> + %12 = "compute"(%11) : (f32) -> f32 + affine.store %12, %1[%8 mod 2, %8] : memref<2x32xf32, 1> + } + %3 = affine.apply #map3(%c8) + %4 = affine.apply #map4(%3) + %5 = affine.apply #map4(%3) + affine.dma_wait %2[%3 mod 2, symbol(%c0_0)], %c128 : memref<2x1xf32> + %6 = affine.load %1[%3 mod 2, %3] : memref<2x32xf32, 1> + %7 = "compute"(%6) : (f32) -> f32 + affine.store %7, %1[%3 mod 2, %3] : memref<2x32xf32, 1> + dealloc %2 : memref<2x1xf32> + dealloc %1 : memref<2x32xf32, 1> + return + } + } + ``` + }]; let constructor = "mlir::createPipelineDataTransferPass()"; } @@ -71,6 +133,46 @@ def MemRefDataFlowOpt : 
Pass<"memref-dataflow-opt"> { let summary = "Perform store/load forwarding for memrefs"; + let description = [{ + This pass performs store to load forwarding for memref's to eliminate memory + accesses and potentially the entire memref if all its accesses are + forwarded. + + Input + + ```mlir + func @store_load_affine_apply() -> memref<10x10xf32> { + %cf7 = constant 7.0 : f32 + %m = alloc() : memref<10x10xf32> + affine.for %i0 = 0 to 10 { + affine.for %i1 = 0 to 10 { + affine.store %cf7, %m[%i0, %i1] : memref<10x10xf32> + %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32> + %v1 = addf %v0, %v0 : f32 + } + } + return %m : memref<10x10xf32> + } + ``` + + Output + + ```mlir + module { + func @store_load_affine_apply() -> memref<10x10xf32> { + %cst = constant 7.000000e+00 : f32 + %0 = alloc() : memref<10x10xf32> + affine.for %arg0 = 0 to 10 { + affine.for %arg1 = 0 to 10 { + affine.store %cst, %0[%arg0, %arg1] : memref<10x10xf32> + %1 = addf %cst, %cst : f32 + } + } + return %0 : memref<10x10xf32> + } + } + ``` + }]; let constructor = "mlir::createMemRefDataFlowOptPass()"; } diff --git a/mlir/tools/mlir-tblgen/CMakeLists.txt b/mlir/tools/mlir-tblgen/CMakeLists.txt --- a/mlir/tools/mlir-tblgen/CMakeLists.txt +++ b/mlir/tools/mlir-tblgen/CMakeLists.txt @@ -14,6 +14,7 @@ OpFormatGen.cpp OpInterfacesGen.cpp PassGen.cpp + PassDocGen.cpp RewriterGen.cpp SPIRVUtilsGen.cpp StructsGen.cpp diff --git a/mlir/tools/mlir-tblgen/PassDocGen.cpp b/mlir/tools/mlir-tblgen/PassDocGen.cpp new file mode 100644 --- /dev/null +++ b/mlir/tools/mlir-tblgen/PassDocGen.cpp @@ -0,0 +1,80 @@ +//===- PassDocGen.cpp - MLIR pass documentation generator -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// PassDocGen uses the description of passes to generate documentation. +// +//===----------------------------------------------------------------------===// + +#include "DocGenUtilities.h" +#include "mlir/TableGen/GenInfo.h" +#include "mlir/TableGen/Pass.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/TableGen/Record.h" + +using namespace mlir; +using namespace mlir::tblgen; + +/// Emit the documentation for the given pass. +static void emitDoc(const Pass &pass, raw_ostream &os) { + os << llvm::formatv("### `-{0}`: {1}\n", pass.getArgument(), + pass.getSummary()); + emitDescription(pass.getDescription(), os); + + // Handle the options of the pass. + ArrayRef options = pass.getOptions(); + if (!options.empty()) { + os << "\n#### Options\n```\n"; + size_t longestOption = 0; + for (const PassOption &option : options) + longestOption = std::max(option.getArgument().size(), longestOption); + for (const PassOption &option : options) { + os << "-" << option.getArgument(); + os.indent(longestOption - option.getArgument().size()) + << " : " << option.getDescription() << "\n"; + } + os << "```\n"; + } + + // Handle the statistics of the pass. 
+  ArrayRef<PassStatistic> stats = pass.getStatistics();
+  if (!stats.empty()) {
+    os << "\n#### Statistics\n```\n";
+    size_t longestStat = 0;
+    for (const PassStatistic &stat : stats)
+      longestStat = std::max(stat.getName().size(), longestStat);
+    for (const PassStatistic &stat : stats) {
+      os << stat.getName();
+      os.indent(longestStat - stat.getName().size())
+          << " : " << stat.getDescription() << "\n";
+    }
+    os << "```\n";
+  }
+}
+
+static void emitDocs(const llvm::RecordKeeper &recordKeeper, raw_ostream &os) {
+  os << "\n";
+  auto passDefs = recordKeeper.getAllDerivedDefinitions("Pass");
+
+  // Collect the registered passes, sorted by argument name.
+  SmallVector<Pass, 16> passes(passDefs.begin(), passDefs.end());
+  SmallVector<Pass *, 16> sortedPasses(llvm::make_pointer_range(passes));
+  llvm::array_pod_sort(sortedPasses.begin(), sortedPasses.end(),
+                       [](Pass *const *lhs, Pass *const *rhs) {
+                         return (*lhs)->getArgument().compare(
+                             (*rhs)->getArgument());
+                       });
+  for (Pass *pass : sortedPasses)
+    emitDoc(*pass, os);
+}
+
+static mlir::GenRegistration
+    genRegister("gen-pass-doc", "Generate pass documentation",
+                [](const llvm::RecordKeeper &records, raw_ostream &os) {
+                  emitDocs(records, os);
+                  return false;
+                });
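
For reference, given a hypothetical pass registered as `my-pass` with one option
and one statistic, `emitDoc` above would emit Markdown shaped like the following
(the pass, option, and statistic names are illustrative only):

````
### `-my-pass`: Summary of the hypothetical pass
Longer description taken from the pass's `description` field.

#### Options
```
-my-option : Description of the hypothetical option
```

#### Statistics
```
num-rewrites : Number of rewrites performed
```
````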