diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns b/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
--- a/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/data/nell-2.tns
@@ -1,3 +1,8 @@
+# Extended FROSTT format:
+# rank number-non-zero-elements
+# dimension-sizes
+3 5
+2 4 4
 1 1 1 1.0
 1 2 2 2.0
 1 3 4 3.0
diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
--- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py
@@ -532,6 +532,24 @@
     return tuple(self._get_element(idx).dst_format.format_pack.formats)
 
 
+class _SparseValueInfo(enum.Enum):
+  """Describes how a sparse tensor value is stored.
+  _UNPACKED: The sparse tensor value is stored as (coordinates, values) in
+    Python.
+  _PACKED: The sparse tensor value is stored as a C pointer to a packed MLIR
+    sparse tensor.
+  """
+  _UNPACKED = 0
+  _PACKED = 1
+
+
+@dataclasses.dataclass(frozen=True)
+class _Assignment:
+  """Records an assignment to a tensor T as T[indices] = expression."""
+  indices: Tuple["IndexVar", ...]
+  expression: "IndexExpr"
+
+
 class Tensor:
   """The tensor class.
 
@@ -622,12 +640,14 @@
     self._name = name or self._get_unique_name()
     self._dtype = dtype
 
+    self._assignment = None
     # We currently use _coords and _values to host the sparse tensor value with
     # COO format, and _dense_storage to host the dense tensor value. We haven't
     # implement the conversion between the two storages yet. This will be
     # improved in a follow up CL.
     self._coords = []
     self._values = []
+    self._sparse_value_location = _SparseValueInfo._UNPACKED
     self._dense_storage = None
     self._stats = _Stats()
     if value_or_shape is None or isinstance(value_or_shape, int) or isinstance(
@@ -647,7 +667,29 @@
                        "Must be a tuple or list for a shape or a single value"
                        f"if initializing a scalar tensor: {value_or_shape}.")
 
+  def is_unpacked(self) -> bool:
+    """Returns true if the tensor value is not packed as MLIR sparse tensor."""
+    return (self._sparse_value_location == _SparseValueInfo._UNPACKED)
+
+  def unpack(self) -> None:
+    """Unpacks the MLIR sparse tensor representation."""
+    if self.is_dense() or self.is_unpacked():
+      return
+
+    # Use the output MLIR sparse tensor pointer to retrieve the COO-flavored
+    # values and verify the values.
+    rank, nse, shape, values, indices = utils.sparse_tensor_to_coo_tensor(
+        self._packed_sparse_value, np.float64)
+    assert rank == self.order
+    assert np.allclose(self.shape, shape)
+    assert nse == len(values)
+    self._coords = indices
+    self._values = values
+    self._sparse_value_location = _SparseValueInfo._UNPACKED
+
   def __repr__(self) -> str:
+    self._sync_value()
+    self.unpack()
     value_str = (f"{repr(self._dense_storage)})" if self.is_dense() else
                  f"{repr(self._coords)} {repr(self._values)})")
     return (f"Tensor(_name={repr(self._name)} "
@@ -665,6 +707,11 @@
     Raises:
       ValueError: When there is any problem in the parameters.
     """
+    if self.is_dense():
+      raise ValueError("Insert method is not supported for dense tensors.")
+    if self._assignment is not None or not self.is_unpacked():
+      raise ValueError(
+          "Can't use Insert method for a tensor constructed from a file.")
     if not isinstance(coords, list):
       raise ValueError(f"Non list coordinate detected: {coords}.")
     if not _all_instance_of(coords, int):
@@ -692,6 +739,9 @@
     if not self.is_dense():
       raise ValueError("Conversion from non-dense Tensor "
                        "to numpy array not supported yet.")
+
+    self._sync_value()
+
     return self._dense_storage
 
   @staticmethod
@@ -755,6 +805,32 @@
 
     return tensor
 
+  @staticmethod
+  def from_file(
+      filename: str,
+      fmt: Format,
+      dtype: DType,
+  ) -> "Tensor":
+    """Constructs a sparse tensor using the COO-flavored values from a file.
+
+    Args:
+      filename: A string for the name of the file that contains the sparse
+        tensor data.
+      fmt: The tensor storage format.
+      dtype: The tensor element data type. NOTE(review): unused by the body.
+
+    Returns:
+      A tensor with the given non-zero values and storage format. The tensor
+      value is stored as an MLIR sparse tensor.
+    """
+    sparse_tensor, shape = utils.create_sparse_tensor(filename,
+                                                      fmt.format_pack.formats)
+    tensor = Tensor(shape.tolist(), fmt)
+    tensor._sparse_value_location = _SparseValueInfo._PACKED
+    tensor._packed_sparse_value = sparse_tensor
+
+    return tensor
+
   @property
   def dtype(self) -> DType:
     """Returns the data type for the Tensor."""
@@ -827,7 +903,13 @@
       raise ValueError("Mismatch between indices and tensor rank: "
                        f"len({indices}) != {self.order}.")
 
-    result = value.evaluate(self, indices)
+    self._assignment = _Assignment(indices, value)
+
+  def evaluate(self) -> None:
+    """Evaluates the assignment to the tensor."""
+    result = self._assignment.expression.evaluate(self,
+                                                  self._assignment.indices)
+    self._assignment = None
     if self.is_dense():
       assert isinstance(result, np.ndarray)
       self._dense_storage = result
@@ -836,6 +918,11 @@
       assert (result[0].ndim, result[1].ndim) == (1, 2)
       (self._values, self._coords) = result
 
+  def _sync_value(self) -> None:
+    """Updates the tensor value by evaluating the pending assignment."""
+    if self._assignment is not None:
+      self.evaluate()
+
   def mlir_tensor_type(self) -> ir.RankedTensorType:
     """Returns the MLIR type for the tensor."""
     return _mlir_tensor_type(self._dtype, tuple(self._shape),
@@ -860,16 +947,21 @@
       self._dense_storage = np.zeros(self._shape, self._dtype.value)
       return _ctype_pointer_from_array(self._dense_storage)
 
-    shape = np.array(self._shape, np.int64)
-    indices = np.array(self._coords, np.int64)
-    values = np.array(self._values, self._dtype.value)
-    ptr = utils.coo_tensor_to_sparse_tensor(shape, values, indices)
+    if self.is_unpacked():
+      shape = np.array(self._shape, np.int64)
+      indices = np.array(self._coords, np.int64)
+      values = np.array(self._values, self._dtype.value)
+      ptr = utils.coo_tensor_to_sparse_tensor(shape, values, indices)
+    else:
+      ptr = self._packed_sparse_value
+
     return ctypes.pointer(ctypes.cast(ptr, ctypes.c_void_p))
 
   def get_coordinates_and_values(
       self) -> Tuple[List[Tuple[int, ...]], List[_AnyRuntimeType]]:
     """Returns the coordinates and values for the non-zero elements."""
     if not self.is_dense():
+      assert (self.is_unpacked())
       return (self._coords, self._values)
 
     # Coordinates for non-zero elements, grouped by dimensions.
diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py
--- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_io.py
@@ -30,118 +30,6 @@
 _MTX_FILENAME_SUFFIX = ".mtx"
 _TNS_FILENAME_SUFFIX = ".tns"
 
-_MTX_HEAD = "%%MatrixMarket"
-_MTX_MATRIX = "matrix"
-_MTX_COORDINATE = "coordinate"
-_MTX_REAL = "real"
-_MTX_SYMMETRY = "symmetric"
-_MTX_GENERAL = "general"
-_SYMMETRY_FIELD_ID = 4
-
-# The TACO supported header for .mtx has the following five fields:
-# . %%MatrixMarket
-# . matrix | tensor
-# . coordinate | array
-# . real
-# . symmetric | general
-#
-# This is what we support currently.
-_SUPPORTED_HEADER_FIELDS = ((_MTX_HEAD,), (_MTX_MATRIX,), (_MTX_COORDINATE,),
-                            (_MTX_REAL,), (_MTX_GENERAL, _MTX_SYMMETRY))
-
-_A_SPACE = " "
-_MTX_COMMENT = "%"
-_TNS_COMMENT = "#"
-
-
-def _coordinate_from_strings(strings: List[str]) -> List[int]:
-  """"Return the coordinate represented by the input strings."""
-  # Coordinates are 1-based in the text file and 0-based in memory.
-  return [int(s) - 1 for s in strings]
-
-
-def _read_coordinate_format(file: TextIO, tensor: Tensor,
-                            is_symmetric: bool) -> None:
-  """Reads tensor values in coordinate format."""
-  rank = tensor.order
-  # Process the data for the tensor.
-  for line in file:
-    if not line:
-      continue
-
-    fields = line.split(_A_SPACE)
-    if rank != len(fields) - 1:
-      raise ValueError("The format and data have mismatched ranks: "
-                       f"{rank} vs {len(fields)-1}.")
-    coordinate = _coordinate_from_strings(fields[:-1])
-    value = float(fields[-1])
-    tensor.insert(coordinate, value)
-    if is_symmetric and coordinate[0] != coordinate[-1]:
-      coordinate.reverse()
-      tensor.insert(coordinate, value)
-
-
-def _read_mtx(file: TextIO, fmt: Format) -> Tensor:
-  """Inputs tensor from a text file with .mtx format."""
-  # The first line should have this five fields:
-  # head tensor-kind format data-type symmetry
-  fields = file.readline().rstrip("\n").split(_A_SPACE)
-  tuple_to_str = lambda x: "|".join(x)
-  if len(fields) != len(_SUPPORTED_HEADER_FIELDS):
-    raise ValueError(
-        "Expected first line with theses fields "
-        f"{' '.join(map(tuple_to_str, _SUPPORTED_HEADER_FIELDS))}: "
-        f"{' '.join(fields)}")
-
-  for i, values in enumerate(_SUPPORTED_HEADER_FIELDS):
-    if fields[i] not in values:
-      raise ValueError(f"The {i}th field can only be one of these values "
-                       f"{tuple_to_str(values)}: {fields[i]}")
-
-  is_symmetric = (fields[_SYMMETRY_FIELD_ID] == _MTX_SYMMETRY)
-  # Skip leading empty lines or comment lines.
-  line = file.readline()
-  while not line or line[0] == _MTX_COMMENT:
-    line = file.readline()
-
-  # Process the first data line with dimensions and number of non-zero values.
-  fields = line.split(_A_SPACE)
-  rank = fmt.rank()
-  if rank != len(fields) - 1:
-    raise ValueError("The format and data have mismatched ranks: "
-                     f"{rank} vs {len(fields)-1}.")
-  shape = fields[:-1]
-  shape = [int(s) for s in shape]
-  num_non_zero = float(fields[-1])
-
-  # Read the tensor values in coordinate format.
-  tensor = Tensor(shape, fmt)
-  _read_coordinate_format(file, tensor, is_symmetric)
-  return tensor
-
-
-def _read_tns(file: TextIO, fmt: Format) -> Tensor:
-  """Inputs tensor from a text file with .tns format."""
-  rank = fmt.rank()
-  coordinates = []
-  values = []
-  dtype = DType(Type.FLOAT64)
-
-  for line in file:
-    # Skip empty lines and comment lines.
-    if not line or line[0] == _TNS_COMMENT:
-      continue
-
-    # Process each line with a coordinate and the value at the coordinate.
-    fields = line.split(_A_SPACE)
-    if rank != len(fields) - 1:
-      raise ValueError("The format and data have mismatched ranks: "
-                       f"{rank} vs {len(fields)-1}.")
-    coordinates.append(tuple(_coordinate_from_strings(fields[:-1])))
-    values.append(dtype.value(fields[-1]))
-
-  return Tensor.from_coo(coordinates, values, fmt, dtype)
-
 
 def _write_tns(file: TextIO, tensor: Tensor) -> None:
   """Outputs a tensor to a file using .tns format."""
@@ -177,9 +65,7 @@
   if not isinstance(fmt, Format) or fmt.is_dense():
     raise ValueError(f"Expected a sparse Format object: {fmt}.")
 
-  with open(filename, "r") as file:
-    return (_read_mtx(file, fmt) if filename.endswith(_MTX_FILENAME_SUFFIX) else
-            _read_tns(file, fmt))
+  return Tensor.from_file(filename, fmt, DType(Type.FLOAT64))
 
 
 def write(filename: str, tensor: Tensor) -> None:
@@ -202,5 +88,7 @@
   if not isinstance(tensor, Tensor):
     raise ValueError(f"Expected a Tensor object: {tensor}.")
 
+  # TODO: combine the evaluation and the outputting into one step.
+  tensor._sync_value()
   with open(filename, "w") as file:
     return _write_tns(file, tensor)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py b/mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/taco/unit_test_tensor_io.py
@@ -0,0 +1,108 @@
+# RUN: SUPPORTLIB=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext %PYTHON %s | FileCheck %s
+
+from string import Template
+
+import numpy as np
+import os
+import sys
+import tempfile
+
+_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_PATH)
+from tools import mlir_pytaco
+from tools import mlir_pytaco_io
+from tools import mlir_pytaco_utils as pytaco_utils
+
+# Define the aliases to shorten the code.
+_COMPRESSED = mlir_pytaco.ModeFormat.COMPRESSED
+_DENSE = mlir_pytaco.ModeFormat.DENSE
+
+
+def _run(f):
+  print(f.__name__)
+  f()
+  return f
+
+
+_FORMAT = mlir_pytaco.Format([_COMPRESSED, _COMPRESSED])
+_MTX_DATA_TEMPLATE = Template(
+    """%%MatrixMarket matrix coordinate real $general_or_symmetry
+3 3 3
+3 1 3
+1 2 2
+3 2 4
+""")
+
+
+def _get_mtx_data(value):
+  mtx_data = _MTX_DATA_TEMPLATE
+  return mtx_data.substitute(general_or_symmetry=value)
+
+
+# CHECK-LABEL: test_read_mtx_matrix_general
+@_run
+def test_read_mtx_matrix_general():
+  with tempfile.TemporaryDirectory() as test_dir:
+    file_name = os.path.join(test_dir, "data.mtx")
+    with open(file_name, "w") as file:
+      file.write(_get_mtx_data("general"))
+    a = mlir_pytaco_io.read(file_name, _FORMAT)
+  passed = 0
+  # The value of a is stored as an MLIR sparse tensor.
+  passed += (not a.is_unpacked())
+  a.unpack()
+  passed += (a.is_unpacked())
+  coords, values = a.get_coordinates_and_values()
+  passed += np.allclose(coords, [[0, 1], [2, 0], [2, 1]])
+  passed += np.allclose(values, [2.0, 3.0, 4.0])
+  # CHECK: 4
+  print(passed)
+
+
+# CHECK-LABEL: test_read_mtx_matrix_symmetry
+@_run
+def test_read_mtx_matrix_symmetry():
+  with tempfile.TemporaryDirectory() as test_dir:
+    file_name = os.path.join(test_dir, "data.mtx")
+    with open(file_name, "w") as file:
+      file.write(_get_mtx_data("symmetric"))
+    a = mlir_pytaco_io.read(file_name, _FORMAT)
+  passed = 0
+  # The value of a is stored as an MLIR sparse tensor.
+  passed += (not a.is_unpacked())
+  a.unpack()
+  passed += (a.is_unpacked())
+  coords, values = a.get_coordinates_and_values()
+  passed += np.allclose(coords,
+                        [[0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1]])
+  passed += np.allclose(values, [2.0, 3.0, 2.0, 4.0, 3.0, 4.0])
+  # CHECK: 4
+  print(passed)
+
+
+_TNS_DATA = """2 3
+3 2
+3 1 3
+1 2 2
+3 2 4
+"""
+
+
+# CHECK-LABEL: test_read_tns
+@_run
+def test_read_tns():
+  with tempfile.TemporaryDirectory() as test_dir:
+    file_name = os.path.join(test_dir, "data.tns")
+    with open(file_name, "w") as file:
+      file.write(_TNS_DATA)
+    a = mlir_pytaco_io.read(file_name, _FORMAT)
+  passed = 0
+  # The value of a is stored as an MLIR sparse tensor.
+  passed += (not a.is_unpacked())
+  a.unpack()
+  passed += (a.is_unpacked())
+  coords, values = a.get_coordinates_and_values()
+  passed += np.allclose(coords, [[0, 1], [2, 0], [2, 1]])
+  passed += np.allclose(values, [2.0, 3.0, 4.0])
+  # CHECK: 4
+  print(passed)