diff --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -635,8 +635,8 @@
       matchPattern(hi, m_Constant(&hiInt)) &&
       matchPattern(step, m_Constant(&stepInt))) {
     if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)
-      return rewriter.create<vector::ConstantMaskOp>(
-          loc, mtp, rewriter.getI64ArrayAttr(codegen.curVecLength));
+      return rewriter.create<vector::BroadcastOp>(
+          loc, mtp, rewriter.create<ConstantIntOp>(loc, 1, 1));
   }
   // Otherwise, generate a vector mask that avoids overrunning the upperbound
   // during vector execution. Here we rely on subsequent loop optimizations to
@@ -723,9 +723,13 @@
 static void genTensorStore(Merger &merger, CodeGen &codegen,
                            PatternRewriter &rewriter, linalg::GenericOp op,
                            unsigned tensor, Value rhs) {
+  Location loc = op.getLoc();
   // Test if this is a scalarized reduction.
   unsigned lhs = op.getNumShapedOperands() - 1;
   if (lhs == tensor && codegen.redVal) {
+    if (codegen.curVecLength > 1)
+      rhs = rewriter.create<SelectOp>(loc, codegen.curVecMask, rhs,
+                                      codegen.redVal);
     codegen.redVal = rhs;
     return;
   }
@@ -736,7 +740,6 @@
     unsigned idx = map.getDimPosition(i);
     args.push_back(codegen.loops[idx]); // universal dense index
   }
-  Location loc = op.getLoc();
   Value ptr = codegen.buffers[tensor];
   if (codegen.curVecLength > 1)
     genVectorStore(codegen, rewriter, rhs, ptr, args);
diff --git a/mlir/test/Dialect/Linalg/sparse_vector.mlir b/mlir/test/Dialect/Linalg/sparse_vector.mlir
--- a/mlir/test/Dialect/Linalg/sparse_vector.mlir
+++ b/mlir/test/Dialect/Linalg/sparse_vector.mlir
@@ -250,6 +250,37 @@
   return %0 : tensor<f32>
 }
 
+//
+// CHECK-VEC1-LABEL: func @reduction_17
+// CHECK-VEC1-DAG:   %[[c0:.*]] = constant 0 : index
+// CHECK-VEC1-DAG:   %[[c16:.*]] = constant 16 : index
+// CHECK-VEC1-DAG:   %[[c17:.*]] = constant 17 : index
+// CHECK-VEC1-DAG:   %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
+// CHECK-VEC1:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c17]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
+// CHECK-VEC1:         %[[sub:.*]] = subi %[[c17]], %[[i]] : index
+// CHECK-VEC1:         %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
+// CHECK-VEC1:         %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<17xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC1:         %[[lb:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<17xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC1:         %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
+// CHECK-VEC1:         %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
+// CHECK-VEC1:         %[[s:.*]] = select %[[mask]], %[[a]], %[[red_in]] : vector<16xi1>, vector<16xf32>
+// CHECK-VEC1:         scf.yield %[[s]] : vector<16xf32>
+// CHECK-VEC1:       }
+// CHECK-VEC1:       %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
+// CHECK-VEC1:       return
+//
+func @reduction_17(%arga: tensor<17xf32>, %argb: tensor<17xf32>, %argx: tensor<f32>) -> tensor<f32> {
+  %0 = linalg.generic #trait_reduction_d
+    ins(%arga, %argb: tensor<17xf32>, tensor<17xf32>)
+    outs(%argx: tensor<f32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = mulf %a, %b : f32
+        %1 = addf %x, %0 : f32
+        linalg.yield %1 : f32
+  } -> tensor<f32>
+  return %0 : tensor<f32>
+}
+
 #trait_mul_ds = {
   indexing_maps = [
     affine_map<(i,j) -> (i,j)>,  // a
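
Not part of the patch, just an illustrative sketch: the select that genTensorStore now emits for vectorized reductions blends the updated accumulator with the previous one under the loop mask, so that lanes past the trip count (here 17 mod 16 = 1 active lane in the final iteration) do not corrupt the reduction. The function name below is made up, and the snippet uses the same pre-arith-dialect syntax (mulf/addf/select on vectors) as the test above.

func @masked_reduction_step(%mask: vector<16xi1>, %red_in: vector<16xf32>,
                            %a: vector<16xf32>, %b: vector<16xf32>) -> vector<16xf32> {
  // Combine the freshly loaded lanes into the running reduction.
  %m = mulf %a, %b : vector<16xf32>
  %sum = addf %red_in, %m : vector<16xf32>
  // Lanes disabled by the mask keep the previous accumulator value.
  %res = select %mask, %sum, %red_in : vector<16xi1>, vector<16xf32>
  return %res : vector<16xf32>
}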