diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp --- a/mlir/lib/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Analysis/AffineAnalysis.cpp @@ -58,6 +58,12 @@ .Case([](arith::MulFOp) { return AtomicRMWKind::mulf; }) .Case([](arith::AddIOp) { return AtomicRMWKind::addi; }) .Case([](arith::MulIOp) { return AtomicRMWKind::muli; }) + .Case([](MinFOp) { return AtomicRMWKind::minf; }) + .Case([](MaxFOp) { return AtomicRMWKind::maxf; }) + .Case([](MinSIOp) { return AtomicRMWKind::mins; }) + .Case([](MaxSIOp) { return AtomicRMWKind::maxs; }) + .Case([](MinUIOp) { return AtomicRMWKind::minu; }) + .Case([](MaxUIOp) { return AtomicRMWKind::maxu; }) .Default([](Operation *) -> Optional { // TODO: AtomicRMW supports other kinds of reductions this is // currently not detecting, add those when the need arises. diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -251,35 +251,17 @@ case AtomicRMWKind::muli: return builder.create(loc, lhs, rhs); case AtomicRMWKind::maxf: - return builder.create( - loc, - builder.create(loc, arith::CmpFPredicate::OGT, lhs, rhs), - lhs, rhs); + return builder.create(loc, lhs, rhs); case AtomicRMWKind::minf: - return builder.create( - loc, - builder.create(loc, arith::CmpFPredicate::OLT, lhs, rhs), - lhs, rhs); + return builder.create(loc, lhs, rhs); case AtomicRMWKind::maxs: - return builder.create( - loc, - builder.create(loc, arith::CmpIPredicate::sgt, lhs, rhs), - lhs, rhs); + return builder.create(loc, lhs, rhs); case AtomicRMWKind::mins: - return builder.create( - loc, - builder.create(loc, arith::CmpIPredicate::slt, lhs, rhs), - lhs, rhs); + return builder.create(loc, lhs, rhs); case AtomicRMWKind::maxu: - return builder.create( - loc, - builder.create(loc, arith::CmpIPredicate::ugt, lhs, rhs), - lhs, rhs); + return builder.create(loc, lhs, rhs); case AtomicRMWKind::minu: - 
return builder.create( - loc, - builder.create(loc, arith::CmpIPredicate::ult, lhs, rhs), - lhs, rhs); + return builder.create(loc, lhs, rhs); // TODO: Add remaining reduction operations. default: (void)emitOptionalError(loc, "Reduction operation type not supported"); diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -371,16 +371,28 @@ builder.getStringAttr("mul"), vector, ValueRange{}); case AtomicRMWKind::minf: + return builder.create(vector.getLoc(), scalarType, + builder.getStringAttr("minf"), + vector, ValueRange{}); case AtomicRMWKind::mins: + return builder.create(vector.getLoc(), scalarType, + builder.getStringAttr("minsi"), + vector, ValueRange{}); case AtomicRMWKind::minu: return builder.create(vector.getLoc(), scalarType, - builder.getStringAttr("min"), + builder.getStringAttr("minui"), vector, ValueRange{}); case AtomicRMWKind::maxf: + return builder.create(vector.getLoc(), scalarType, + builder.getStringAttr("maxf"), + vector, ValueRange{}); case AtomicRMWKind::maxs: + return builder.create(vector.getLoc(), scalarType, + builder.getStringAttr("maxsi"), + vector, ValueRange{}); case AtomicRMWKind::maxu: return builder.create(vector.getLoc(), scalarType, - builder.getStringAttr("max"), + builder.getStringAttr("maxui"), vector, ValueRange{}); // TODO: Add remaining reduction operations. 
default: diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -29,6 +29,168 @@ // ----- +func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0x7F800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %min = minf %red_iter, %ld : f32 + affine.yield %min : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_minf +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vmax:.*]] = arith.constant dense<0x7F800000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[min:.*]] = minf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[min]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_min:.*]] = vector.reduction "minf", %[[vred]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_min]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +func @vecdim_reduction_maxf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %max = maxf %red_iter, %ld : f32 + affine.yield %max : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_maxf +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vmin:.*]] = arith.constant 
dense<0xFF800000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[max:.*]] = maxf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[max]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_max:.*]] = vector.reduction "maxf", %[[vred]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_max]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +func @vecdim_reduction_minsi(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant 2147483647 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %min = minsi %red_iter, %ld : i32 + affine.yield %min : i32 + } + affine.store %final_red, %out[%i] : memref<256xi32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_minsi +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vmax:.*]] = arith.constant dense<2147483647> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xi32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32> +// CHECK: %[[min:.*]] = minsi %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[min]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_min:.*]] = vector.reduction "minsi", %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_min]], %{{.*}} : memref<256xi32> +// CHECK: } + +// ----- + +func @vecdim_reduction_maxsi(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant -2147483648 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + 
%max = maxsi %red_iter, %ld : i32 + affine.yield %max : i32 + } + affine.store %final_red, %out[%i] : memref<256xi32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_maxsi +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vmin:.*]] = arith.constant dense<-2147483648> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xi32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32> +// CHECK: %[[max:.*]] = maxsi %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[max]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_max:.*]] = vector.reduction "maxsi", %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_max]], %{{.*}} : memref<256xi32> +// CHECK: } + +// ----- + +func @vecdim_reduction_minui(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant -1 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %min = minui %red_iter, %ld : i32 + affine.yield %min : i32 + } + affine.store %final_red, %out[%i] : memref<256xi32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_minui +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vmax:.*]] = arith.constant dense<-1> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xi32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32> +// CHECK: %[[min:.*]] = minui %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[min]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_min:.*]] = vector.reduction "minui", %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_min]], %{{.*}} : memref<256xi32> +// CHECK: } + +// ----- + +func @vecdim_reduction_maxui(%in: 
memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant 0 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %max = maxui %red_iter, %ld : i32 + affine.yield %max : i32 + } + affine.store %final_red, %out[%i] : memref<256xi32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_maxui +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vmin:.*]] = arith.constant dense<0> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xi32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32> +// CHECK: %[[max:.*]] = maxui %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[max]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_max:.*]] = vector.reduction "maxui", %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_max]], %{{.*}} : memref<256xi32> +// CHECK: } + +// ----- + // The inner reduction loop '%j' is vectorized. (The order of addf's operands is // different than in the previous test case).