diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -686,7 +686,8 @@
 def GPU_AllReduceOp : GPU_Op<"all_reduce",
     [SameOperandsAndResultType, IsolatedFromAbove]>,
     Arguments<(ins AnyType:$value,
-               OptionalAttr<GPU_AllReduceOperationAttr>:$op)>,
+               OptionalAttr<GPU_AllReduceOperationAttr>:$op,
+               UnitAttr:$non_uniform)>,
     Results<(outs AnyType)> {
   let summary = "Reduce values among workgroup.";
   let description = [{
@@ -709,11 +710,12 @@
     accumulation as code region. The accumulation operation must be one of:
     `add`, `and`, `max`, `min`, `mul`, `or`, `xor`.
 
-    Either none or all work items of a workgroup need to execute this op
-    in convergence.
+    If the `non_uniform` flag is not set, either none or all work items of a
+    workgroup need to execute this op in convergence.
   }];
   let regions = (region AnyRegion:$body);
-  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value $body attr-dict
+  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
+                          (`non_uniform` $non_uniform^)? $body attr-dict
                           `:` functional-type(operands, results) }];
   let hasRegionVerifier = 1;
 }
@@ -721,7 +723,8 @@
 def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce",
     [SameOperandsAndResultType]>,
     Arguments<(ins AnyType:$value,
-               GPU_AllReduceOperationAttr:$op)>,
+               GPU_AllReduceOperationAttr:$op,
+               UnitAttr:$non_uniform)>,
     Results<(outs AnyType)> {
   let summary = "Reduce values among subgroup.";
   let description = [{
@@ -734,10 +737,11 @@
     %1 = gpu.subgroup_reduce add %0 : (f32) -> (f32)
     ```
 
-    Either none or all work items of a subgroup need to execute this op
-    in convergence.
+    If the `non_uniform` flag is not set, either none or all work items of a
+    subgroup need to execute this op in convergence.
   }];
-  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value attr-dict
+  let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
+                          (`non_uniform` $non_uniform^)? attr-dict
                           `:` functional-type(operands, results) }];
   let hasVerifier = 1;
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
@@ -395,6 +395,11 @@
                                 PatternRewriter &rewriter) const override {
     auto funcOp = cast<gpu::GPUFuncOp>(op);
     auto callback = [&](gpu::AllReduceOp reduceOp) {
+      if (reduceOp.getNonUniform()) {
+        reduceOp.emitWarning("Non uniform reductions are not supported yet.");
+        return WalkResult::advance();
+      }
+
      GpuAllReduceRewriter(funcOp, reduceOp, rewriter).rewrite();
      // Performing a rewrite invalidates the walk iterator. Report interrupt
      // so that we can start a new walk until all all_reduce ops are replaced.
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -83,11 +83,21 @@
     %SgSi = gpu.subgroup_size : index
     %one = arith.constant 1.0 : f32
+
+    // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
+    // CHECK-NEXT: } : (f32) -> f32
     %sum = gpu.all_reduce add %one {} : (f32) -> (f32)
+    // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} non_uniform {
+    // CHECK-NEXT: } : (f32) -> f32
+    %sum1 = gpu.all_reduce add %one non_uniform {} : (f32) -> f32
+    // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (f32) -> f32
     %sum_subgroup = gpu.subgroup_reduce add %one : (f32) -> f32
+    // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} non_uniform : (f32) -> f32
+    %sum_subgroup1 = gpu.subgroup_reduce add %one non_uniform : (f32) -> f32
+
     %width = arith.constant 7 : i32
     %offset = arith.constant 3 : i32
     // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32