diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h @@ -69,6 +69,10 @@ void populatePropagateWarpVectorDistributionPatterns( RewritePatternSet &pattern); +/// Collect patterns to distribute vector reduction ops using GPU warp shuffle +/// ops. +void populateReductionToGPUWarpShufflePatterns(RewritePatternSet &pattern); + } // namespace vector } // namespace mlir #endif // MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORDISTRIBUTION_H_ diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -25,6 +25,7 @@ MLIRBufferizationDialect MLIRBufferizationTransforms MLIRDialectUtils + MLIRGPUOps MLIRIR MLIRLinalgDialect MLIRMemRefDialect diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -8,9 +8,11 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/Transforms/SideEffectUtils.h" @@ -722,6 +724,93 @@ } }; +/// A pattern that extracts vector.reduction ops from a WarpExecuteOnLane0Op. +/// The vector is reduced in parallel. Currently limited to vector<32x...> +/// values. Every lane reduces two scalars, 5 times in a row. 
+/// E.g.: +/// ``` +/// %r = vector.warp_execute_on_lane_0(%laneid) -> (f32) { +/// %0 = "some_def"() : () -> (vector<32xf32>) +/// %1 = vector.reduction <add>, %0 : vector<32xf32> into f32 +/// vector.yield %1 : f32 +/// } +/// ``` +/// is lowered to: +/// ``` +/// %0 = vector.warp_execute_on_lane_0(%laneid) -> (vector<1xf32>) { +/// %1 = "some_def"() : () -> (vector<32xf32>) +/// vector.yield %1 : vector<32xf32> +/// } +/// %a = vector.extract %0[0] : vector<1xf32> +/// %r0, %s0 = gpu.shuffle xor %a, %c1, %c32 : f32 +/// %a0 = arith.addf %a, %r0 : f32 +/// %r1, %s1 = gpu.shuffle xor %a0, %c2, %c32 : f32 +/// %a1 = arith.addf %a0, %r1 : f32 +/// ... +/// %r4, %s4 = gpu.shuffle xor %a3, %c16, %c32 : f32 +/// %r = arith.addf %a3, %r4 : f32 +/// ``` +struct ReductionToGPUWarpShuffle + : public OpRewritePattern<WarpExecuteOnLane0Op> { + using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern; + + LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *yieldOperand = getWarpResult( + warpOp, [](Operation *op) { return isa<vector::ReductionOp>(op); }); + if (!yieldOperand) + return failure(); + + auto reductionOp = + cast<vector::ReductionOp>(yieldOperand->get().getDefiningOp()); + auto vectorType = reductionOp.getVector().getType().cast<VectorType>(); + // Only rank 1 vectors supported. + if (vectorType.getRank() != 1) + return rewriter.notifyMatchFailure( + warpOp, "Only rank 1 reductions can be distributed."); + // Only warp_size-sized vectors supported. + if (static_cast<uint64_t>(vectorType.getShape()[0]) != warpOp.getWarpSize()) + return rewriter.notifyMatchFailure( + warpOp, "Reduction vector dimension must match warp size."); + // Only f32 and i32 element types are supported. 
+ if (!reductionOp.getType().isF32() && + !reductionOp.getType().isSignlessInteger(32)) + return rewriter.notifyMatchFailure( + warpOp, + "Reduction distribution currently only supports 32bits types."); + + Location yieldLoc = yieldOperand->getOwner()->getLoc(); + + // Return vector that will be reduced from the WarpExecuteOnLane0Op. + unsigned operandIndex = yieldOperand->getOperandNumber(); + SmallVector<Value> yieldValues = {reductionOp.getVector()}; + SmallVector<Type> retTypes = {VectorType::get({1}, reductionOp.getType())}; + unsigned numResults = warpOp.getNumResults(); + WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, yieldValues, retTypes); + rewriter.setInsertionPointAfter(newWarpOp); + + // Every lane has one scalar value. These should be reduced. + Value laneValVec = newWarpOp.getResult(numResults); + Value laneVal = rewriter.create<vector::ExtractOp>(yieldLoc, laneValVec, 0); + + // Parallel reduction using butterfly shuffles. + for (uint64_t i = 1; i < newWarpOp.getWarpSize(); i <<= 1) { + Value shuffled = + rewriter + .create<gpu::ShuffleOp>(reductionOp.getLoc(), laneVal, i, + /*width=*/newWarpOp.getWarpSize(), + /*mode=*/gpu::ShuffleMode::XOR) + .result(); + laneVal = makeArithReduction(rewriter, reductionOp.getLoc(), + reductionOp.getKind(), laneVal, shuffled); + } + + newWarpOp.getResult(operandIndex).replaceAllUsesWith(laneVal); + return success(); + } +}; + } // namespace void mlir::vector::populateWarpExecuteOnLane0OpToScfForPattern( @@ -742,6 +831,11 @@ patterns.getContext()); } +void mlir::vector::populateReductionToGPUWarpShufflePatterns( + RewritePatternSet &patterns) { + patterns.add<ReductionToGPUWarpShuffle>(patterns.getContext()); +} + void mlir::vector::moveScalarUniformCode(WarpExecuteOnLane0Op warpOp) { Block *body = warpOp.getBody(); diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ 
b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -434,3 +434,37 @@ "some_use"(%0#0) : (vector<1xf32>) -> () return } + +// ----- + +// CHECK-PROP-LABEL: func @vector_reduction( +// CHECK-PROP-SAME: %[[laneid:.*]]: index) +// CHECK-PROP-DAG: %[[c1:.*]] = arith.constant 1 : i32 +// CHECK-PROP-DAG: %[[c2:.*]] = arith.constant 2 : i32 +// CHECK-PROP-DAG: %[[c4:.*]] = arith.constant 4 : i32 +// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 +// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 +// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 +// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) { +// CHECK-PROP: vector.yield %{{.*}} : vector<32xf32> +// CHECK-PROP: } +// CHECK-PROP: %[[a:.*]] = vector.extract %[[warp_op]][0] : vector<1xf32> +// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] +// CHECK-PROP: %[[a0:.*]] = arith.addf %[[a]], %[[r0]] +// CHECK-PROP: %[[r1:.*]], %{{.*}} = gpu.shuffle xor %[[a0]], %[[c2]], %[[c32]] +// CHECK-PROP: %[[a1:.*]] = arith.addf %[[a0]], %[[r1]] +// CHECK-PROP: %[[r2:.*]], %{{.*}} = gpu.shuffle xor %[[a1]], %[[c4]], %[[c32]] +// CHECK-PROP: %[[a2:.*]] = arith.addf %[[a1]], %[[r2]] +// CHECK-PROP: %[[r3:.*]], %{{.*}} = gpu.shuffle xor %[[a2]], %[[c8]], %[[c32]] +// CHECK-PROP: %[[a3:.*]] = arith.addf %[[a2]], %[[r3]] +// CHECK-PROP: %[[r4:.*]], %{{.*}} = gpu.shuffle xor %[[a3]], %[[c16]], %[[c32]] +// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] +// CHECK-PROP: return %[[a4]] : f32 +func.func @vector_reduction(%laneid: index) -> (f32) { + %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %0 = "some_def"() : () -> (vector<32xf32>) + %1 = vector.reduction <add>, %0 : vector<32xf32> into f32 + vector.yield %1 : f32 + } + return %r : f32 +} diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir new file mode 
100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir @@ -0,0 +1,69 @@ +// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\ +// RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\ +// RUN: mlir-opt -lower-affine -convert-scf-to-cf -convert-vector-to-llvm \ +// RUN: -convert-arith-to-llvm -gpu-kernel-outlining \ +// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin)' \ +// RUN: -gpu-to-llvm -reconcile-unrealized-casts |\ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_cuda_runtime%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// Run a tiled reduction fused with an elementwise op. 
+ +func.func @gpu_func(%in: memref<1024xf32>, %out: memref<1xf32>) { + %c1 = arith.constant 1 : index + %cst = arith.constant dense<100.0000> : vector<1xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c32 = arith.constant 32 : index + gpu.launch blocks(%arg3, %arg4, %arg5) + in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) + threads(%arg6, %arg7, %arg8) in (%arg12 = %c32, %arg13 = %c1, %arg14 = %c1) { + vector.warp_execute_on_lane_0(%arg6)[32] { + %init = vector.transfer_read %out[%c0], %cst_0 {in_bounds = [true]} : memref<1xf32>, vector<1xf32> + %13 = scf.for %arg0 = %c0 to %c1024 step %c32 iter_args(%arg1 = %init) -> (vector<1xf32>) { + %20 = vector.transfer_read %in[%arg0], %cst_0 {in_bounds = [true]} : memref<1024xf32>, vector<32xf32> + %21 = vector.reduction <add>, %20 : vector<32xf32> into f32 + %22 = vector.broadcast %21 : f32 to vector<1xf32> + %23 = arith.addf %22, %arg1 : vector<1xf32> + scf.yield %23 : vector<1xf32> + } + %14 = arith.divf %13, %cst : vector<1xf32> + vector.transfer_write %14, %out[%c0] {in_bounds = [true]} : vector<1xf32>, memref<1xf32> + } + gpu.terminator + } + return +} +func.func @main() { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %0 = memref.alloc() : memref<1024xf32> + %1 = memref.alloc() : memref<1xf32> + %cst_1 = arith.constant dense<[ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]> : vector<32xf32> + %cst_2 = arith.constant dense<2.000000e+00> : vector<1xf32> + // init the buffers. 
+ scf.for %i = %c0 to %c1024 step %c32 { + vector.transfer_write %cst_1, %0[%i] {in_bounds = [true]} : vector<32xf32>, memref<1024xf32> + } + vector.transfer_write %cst_2, %1[%c0] {in_bounds = [true]} : vector<1xf32>, memref<1xf32> + %3 = memref.cast %0 : memref<1024xf32> to memref<*xf32> + gpu.host_register %3 : memref<*xf32> + %5 = memref.cast %1 : memref<1xf32> to memref<*xf32> + gpu.host_register %5 : memref<*xf32> + call @gpu_func(%0, %1) : (memref<1024xf32>, memref<1xf32>) -> () + %6 = vector.transfer_read %1[%c0], %cst : memref<1xf32>, vector<1xf32> + vector.print %6 : vector<1xf32> + return +} + +// CHECK: ( 158.74 ) diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -869,6 +869,7 @@ if (propagateDistribution) { RewritePatternSet patterns(ctx); vector::populatePropagateWarpVectorDistributionPatterns(patterns); + vector::populateReductionToGPUWarpShufflePatterns(patterns); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); } WarpExecuteOnLane0LoweringOptions options; diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3133,6 +3133,7 @@ ":BufferizationTransforms", ":DialectUtils", ":FuncDialect", + ":GPUDialect", ":IR", ":LinalgDialect", ":MemRefDialect",