diff --git a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
@@ -76,6 +76,10 @@
     name = "sparsePointers64";
   else if (eltType.isInteger(32))
     name = "sparsePointers32";
+  else if (eltType.isInteger(16))
+    name = "sparsePointers16";
+  else if (eltType.isInteger(8))
+    name = "sparsePointers8";
   else
     return failure();
   rewriter.replaceOpWithNewOp<CallOp>(
@@ -100,6 +104,10 @@
     name = "sparseIndices64";
   else if (eltType.isInteger(32))
     name = "sparseIndices32";
+  else if (eltType.isInteger(16))
+    name = "sparseIndices16";
+  else if (eltType.isInteger(8))
+    name = "sparseIndices8";
   else
     return failure();
   rewriter.replaceOpWithNewOp<CallOp>(
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -614,18 +614,21 @@
   }
 }
 
+/// Constructs vector type.
+static VectorType vectorType(CodeGen &codegen, Type etp) {
+  return VectorType::get(codegen.curVecLength, etp);
+}
+
 /// Constructs vector type from pointer.
 static VectorType vectorType(CodeGen &codegen, Value ptr) {
-  Type etp = ptr.getType().cast<MemRefType>().getElementType();
-  return VectorType::get(codegen.curVecLength, etp);
+  return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
 }
 
 /// Constructs vector iteration mask.
 static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter,
                            Value iv, Value lo, Value hi, Value step) {
   Location loc = iv.getLoc();
-  VectorType mtp =
-      VectorType::get(codegen.curVecLength, rewriter.getIntegerType(1));
+  VectorType mtp = vectorType(codegen, rewriter.getIntegerType(1));
   // Special case if the vector length evenly divides the trip count (for
   // example, "for i = 0, 128, 16"). A constant all-true mask is generated
   // so that all subsequent masked memory operations are immediately folded
@@ -683,7 +686,7 @@
 /// optimizations to hoist the invariant broadcast out of the vector loop.
 static Value genVectorInvariantValue(CodeGen &codegen,
                                      PatternRewriter &rewriter, Value val) {
-  VectorType vtp = VectorType::get(codegen.curVecLength, val.getType());
+  VectorType vtp = vectorType(codegen, val.getType());
   return rewriter.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
 }
 
@@ -747,15 +750,47 @@
   rewriter.create<memref::StoreOp>(loc, rhs, ptr, args);
 }
 
-/// Generates a pointer/index load from the sparse storage scheme.
+/// Generates a pointer/index load from the sparse storage scheme. Narrower
+/// data types need to be zero extended before casting the value into the
+/// index type used for looping and indexing.
 static Value genLoad(CodeGen &codegen, PatternRewriter &rewriter, Location loc,
                      Value ptr, Value s) {
-  if (codegen.curVecLength > 1)
-    return genVectorLoad(codegen, rewriter, ptr, {s});
+  // See https://llvm.org/docs/GetElementPtr.html for some background on
+  // the complications described below.
+  if (codegen.curVecLength > 1) {
+    // Since the index vector is used in subsequent gather/scatter operations,
+    // which effectively define an unsigned pointer + signed index, we must
+    // zero extend the vector to an index width. For 8-bit and 16-bit values,
+    // a 32-bit index width suffices. For 32-bit values, zero extending the
+    // elements into 64-bit loses some performance, since the 32-bit indexed
+    // gather/scatter is more efficient than the 64-bit index variant (in
+    // the future, we could introduce a flag that states the negative space
+    // of 32-bit indices is unused). For 64-bit values, there is no good way
+    // to state that the indices are unsigned, which creates the potential
+    // for incorrect address calculations in the unlikely case we need such
+    // extremely large offsets.
+    Type etp = ptr.getType().cast<MemRefType>().getElementType();
+    Value vload = genVectorLoad(codegen, rewriter, ptr, {s});
+    if (etp.getIntOrFloatBitWidth() < 32)
+      vload = rewriter.create<ZeroExtendIOp>(
+          loc, vload, vectorType(codegen, rewriter.getIntegerType(32)));
+    else if (etp.getIntOrFloatBitWidth() < 64)
+      vload = rewriter.create<ZeroExtendIOp>(
+          loc, vload, vectorType(codegen, rewriter.getIntegerType(64)));
+    return vload;
+  }
+  // For the scalar case, we simply zero extend narrower indices into 64-bit
+  // values before casting to index, without a performance penalty. Here too,
+  // however, indices that are already 64-bit cannot, in theory, express the
+  // full range, as explained above.
   Value load = rewriter.create<memref::LoadOp>(loc, ptr, s);
-  return load.getType().isa<IndexType>()
-             ? load
-             : rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+  if (!load.getType().isa<IndexType>()) {
+    if (load.getType().getIntOrFloatBitWidth() < 64)
+      load = rewriter.create<ZeroExtendIOp>(loc, load,
+                                            rewriter.getIntegerType(64));
+    load = rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+  }
+  return load;
 }
 
 /// Generates an invariant value.
@@ -959,8 +994,10 @@
     if (!merger.isSparseTensor(t) && !linkedSparse(op, t)) {
       auto map = op.getIndexingMap(t);
       unsigned r = map.getNumResults();
-      if (r && map.getDimPosition(r - 1) != idx)
-        return false;
+      for (unsigned i = 0; i < r; i++) {
+        if (map.getDimPosition(i) == idx && i != r - 1)
+          return false;
+      }
     }
   }
   return true;
diff --git a/mlir/lib/ExecutionEngine/SparseUtils.cpp b/mlir/lib/ExecutionEngine/SparseUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseUtils.cpp
@@ -113,12 +113,21 @@
 class SparseTensorStorageBase {
 public:
   virtual uint64_t getDimSize(uint64_t) = 0;
+
+  // Overhead storage.
   virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
   virtual void getPointers(std::vector<uint32_t> **, uint64_t) { fatal("p32"); }
+  virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
+  virtual void getPointers(std::vector<uint8_t> **, uint64_t) { fatal("p8"); }
   virtual void getIndices(std::vector<uint64_t> **, uint64_t) { fatal("i64"); }
   virtual void getIndices(std::vector<uint32_t> **, uint64_t) { fatal("i32"); }
+  virtual void getIndices(std::vector<uint16_t> **, uint64_t) { fatal("i16"); }
+  virtual void getIndices(std::vector<uint8_t> **, uint64_t) { fatal("i8"); }
+
+  // Primary storage.
   virtual void getValues(std::vector<double> **) { fatal("valf64"); }
   virtual void getValues(std::vector<float> **) { fatal("valf32"); }
+
   virtual ~SparseTensorStorageBase() {}
 
 private:
@@ -464,6 +473,22 @@
   uint64_t strides[1];
 };
 
+struct MemRef1DU16 {
+  const uint16_t *base;
+  const uint16_t *data;
+  uint64_t off;
+  uint64_t sizes[1];
+  uint64_t strides[1];
+};
+
+struct MemRef1DU8 {
+  const uint8_t *base;
+  const uint8_t *data;
+  uint64_t off;
+  uint64_t sizes[1];
+  uint64_t strides[1];
+};
+
 struct MemRef1DF64 {
   const double *base;
   const double *data;
   uint64_t off;
   uint64_t sizes[1];
   uint64_t strides[1];
 };
@@ -480,41 +505,42 @@
   uint64_t strides[1];
 };
 
-enum TypeEnum : uint64_t { kF64 = 0, kF32 = 1, kU64 = 2, kU32 = 3 };
+enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
+enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };
+
+#define CASE(p, i, v, P, I, V)                                                 \
+  if (ptrTp == (p) && indTp == (i) && valTp == (v))                            \
+    return newSparseTensor<P, I, V>(filename, sparsity, asize)
 
 void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
                       uint64_t asize, uint64_t astride, uint64_t ptrTp,
                       uint64_t indTp, uint64_t valTp) {
   assert(astride == 1);
   bool *sparsity = abase + aoff;
-  if (ptrTp == kU64 && indTp == kU64 && valTp == kF64)
-    return newSparseTensor<uint64_t, uint64_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU64 && indTp == kU64 && valTp == kF32)
-    return newSparseTensor<uint64_t, uint64_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU64 && indTp == kU32 && valTp == kF64)
-    return newSparseTensor<uint64_t, uint32_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU64 && indTp == kU32 && valTp == kF32)
-    return newSparseTensor<uint64_t, uint32_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU32 && indTp == kU64 && valTp == kF64)
-    return newSparseTensor<uint32_t, uint64_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU32 && indTp == kU64 && valTp == kF32)
-    return newSparseTensor<uint32_t, uint64_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU32 && indTp == kU32 && valTp == kF64)
-    return newSparseTensor<uint32_t, uint32_t, double>(filename, sparsity,
                                                        asize);
-  if (ptrTp == kU32 && indTp == kU32 && valTp == kF32)
-    return newSparseTensor<uint32_t, uint32_t, float>(filename, sparsity,
-                                                      asize);
+
+  // The most common cases: 64-bit or 32-bit overhead, double/float values.
+  CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
+  CASE(kU64, kU64, kF32, uint64_t, uint64_t, float);
+  CASE(kU64, kU32, kF64, uint64_t, uint32_t, double);
+  CASE(kU64, kU32, kF32, uint64_t, uint32_t, float);
+  CASE(kU32, kU64, kF64, uint32_t, uint64_t, double);
+  CASE(kU32, kU64, kF32, uint32_t, uint64_t, float);
+  CASE(kU32, kU32, kF64, uint32_t, uint32_t, double);
+  CASE(kU32, kU32, kF32, uint32_t, uint32_t, float);
+
+  // Some special cases: low overhead storage, double/float values.
+  CASE(kU16, kU16, kF64, uint16_t, uint16_t, double);
+  CASE(kU8, kU8, kF64, uint8_t, uint8_t, double);
+  CASE(kU16, kU16, kF32, uint16_t, uint16_t, float);
+  CASE(kU8, kU8, kF32, uint8_t, uint8_t, float);
+
+  // Unsupported case (add above if needed).
fputs("unsupported combination of types\n", stderr); exit(1); } +#undef CASE + uint64_t sparseDimSize(void *tensor, uint64_t d) { return static_cast(tensor)->getDimSize(d); } @@ -531,6 +557,18 @@ return {v->data(), v->data(), 0, {v->size()}, {1}}; } +MemRef1DU16 sparsePointers16(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getPointers(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + +MemRef1DU8 sparsePointers8(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getPointers(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + MemRef1DU64 sparseIndices64(void *tensor, uint64_t d) { std::vector *v; static_cast(tensor)->getIndices(&v, d); @@ -543,6 +581,18 @@ return {v->data(), v->data(), 0, {v->size()}, {1}}; } +MemRef1DU16 sparseIndices16(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getIndices(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + +MemRef1DU8 sparseIndices8(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getIndices(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + MemRef1DF64 sparseValuesF64(void *tensor) { std::vector *v; static_cast(tensor)->getValues(&v); diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -36,6 +36,7 @@ # Copy test data over. file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.mtx ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.tns + ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/wide.mtx DESTINATION ${MLIR_INTEGRATION_TEST_DIR}/data/) endif() diff --git a/mlir/test/Dialect/Linalg/sparse_storage.mlir b/mlir/test/Dialect/Linalg/sparse_storage.mlir --- a/mlir/test/Dialect/Linalg/sparse_storage.mlir +++ b/mlir/test/Dialect/Linalg/sparse_storage.mlir @@ -51,7 +51,8 @@ // CHECK-TYPE1: %[[B1:.*]] = index_cast %[[P1]] : i64 to index // CHECK-TYPE1: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE1: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE1: %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index +// CHECK-TYPE1: %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64 +// CHECK-TYPE1: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE1: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE1: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE1: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 @@ -62,9 +63,11 @@ // CHECK-TYPE2: %[[C0:.*]] = constant 0 : index // CHECK-TYPE2: %[[C1:.*]] = constant 1 : index // CHECK-TYPE2: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[P0]] : i32 to index +// CHECK-TYPE2: %[[Z0:.*]] = zexti %[[P0]] : i32 to i64 +// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE2: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[P1]] : i32 to index +// CHECK-TYPE2: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64 +// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE2: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE2: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE2: %[[INDC:.*]] = index_cast %[[IND0]] : i64 to index @@ -78,12 +81,15 @@ // CHECK-TYPE3: %[[C0:.*]] = constant 0 : index // CHECK-TYPE3: %[[C1:.*]] = constant 1 : index // CHECK-TYPE3: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[P0]] : i32 to index +// CHECK-TYPE3: 
%[[Z0:.*]] = zexti %[[P0]] : i32 to i64 +// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE3: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[P1]] : i32 to index +// CHECK-TYPE3: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64 +// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE3: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE3: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE3: %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index +// CHECK-TYPE3: %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64 +// CHECK-TYPE3: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE3: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE3: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE3: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 @@ -94,12 +100,15 @@ // CHECK-TYPE4: %[[C0:.*]] = constant 0 : index // CHECK-TYPE4: %[[C1:.*]] = constant 1 : index // CHECK-TYPE4: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[P0]] : i16 to index +// CHECK-TYPE4: %[[Z0:.*]] = zexti %[[P0]] : i16 to i64 +// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE4: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[P1]] : i16 to index +// CHECK-TYPE4: %[[Z1:.*]] = zexti %[[P1]] : i16 to i64 +// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE4: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE4: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE4: %[[INDC:.*]] = index_cast %[[IND0]] : i16 to index +// CHECK-TYPE4: %[[ZEXT:.*]] = zexti %[[IND0]] : i16 to i64 +// CHECK-TYPE4: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE4: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE4: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE4: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 @@ -110,12 +119,15 @@ // CHECK-TYPE5: %[[C0:.*]] = constant 0 : index // CHECK-TYPE5: %[[C1:.*]] = constant 1 : index // CHECK-TYPE5: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[P0]] : i8 to index +// CHECK-TYPE5: %[[Z0:.*]] = zexti %[[P0]] : i8 to i64 +// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE5: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[P1]] : i8 to index +// CHECK-TYPE5: %[[Z1:.*]] = zexti %[[P1]] : i8 to i64 +// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE5: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE5: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE5: %[[INDC:.*]] = index_cast %[[IND0]] : i8 to index +// CHECK-TYPE5: %[[ZEXT:.*]] = zexti %[[IND0]] : i8 to i64 +// CHECK-TYPE5: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE5: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE5: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE5: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 diff --git a/mlir/test/Dialect/Linalg/sparse_vector.mlir b/mlir/test/Dialect/Linalg/sparse_vector.mlir --- a/mlir/test/Dialect/Linalg/sparse_vector.mlir +++ b/mlir/test/Dialect/Linalg/sparse_vector.mlir @@ -85,12 +85,15 @@ // CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index // CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index // 
CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC0: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC0: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC0: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC0: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC0: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC0: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC0: %[[ci:.*]] = index_cast %[[li]] : i32 to index +// CHECK-VEC0: %[[zi:.*]] = zexti %[[li]] : i32 to i64 +// CHECK-VEC0: %[[ci:.*]] = index_cast %[[zi]] : i64 to index // CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref // CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32> // CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32 @@ -102,12 +105,15 @@ // CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index // CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index // CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC1: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC1: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC1: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC1: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC1: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC1: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC1: %[[ci:.*]] = index_cast %[[li]] : i32 to index +// CHECK-VEC1: %[[zi:.*]] = zexti %[[li]] : i32 to i64 +// CHECK-VEC1: %[[ci:.*]] = index_cast %[[zi]] : i64 to index // CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref // CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32> // CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32 @@ -120,17 +126,20 @@ // CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index // CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index // CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] { // CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index // CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> // CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xi32> into vector<16xi32> +// CHECK-VEC2: %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64> // CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xf32> into vector<16xf32> -// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : 
memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32> // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32> -// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> // CHECK-VEC2: } // CHECK-VEC2: return // @@ -151,17 +160,20 @@ // CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index // CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index // CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] { // CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index // CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> // CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xi32> into vector<16xi32> +// CHECK-VEC2: %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64> // CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xf32> into vector<16xf32> -// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32> // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32> -// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> // CHECK-VEC2: } // CHECK-VEC2: return // @@ -303,13 +315,16 @@ // CHECK-VEC0-DAG: %[[c512:.*]] = constant 512 : index // CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] { // CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC0: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC0: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC0: %[[a:.*]] = addi %[[i]], %[[c1]] : index // CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref -// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC0: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC0: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC0: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC0: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref -// CHECK-VEC0: %[[cj:.*]] = index_cast %[[lj]] : i32 to index +// CHECK-VEC0: %[[zj:.*]] = zexti %[[lj]] : i32 to i64 +// CHECK-VEC0: %[[cj:.*]] = index_cast %[[zj]] : i64 to index // CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref // CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32> // CHECK-VEC0: %[[m:.*]] = 
mulf %[[la]], %[[lb]] : f32 @@ -324,13 +339,16 @@ // CHECK-VEC1-DAG: %[[c512:.*]] = constant 512 : index // CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] { // CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC1: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC1: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC1: %[[a:.*]] = addi %[[i]], %[[c1]] : index // CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref -// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC1: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC1: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC1: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC1: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref -// CHECK-VEC1: %[[cj:.*]] = index_cast %[[lj]] : i32 to index +// CHECK-VEC1: %[[zj:.*]] = zexti %[[lj]] : i32 to i64 +// CHECK-VEC1: %[[cj:.*]] = index_cast %[[zj]] : i64 to index // CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref // CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32> // CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32 @@ -346,18 +364,21 @@ // CHECK-VEC2-DAG: %[[c512:.*]] = constant 512 : index // CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] { // CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC2: %[[a:.*]] = addi %[[i]], %[[c1]] : index // CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref -// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] { // CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[j]] : index // CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> // CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xi32> into vector<16xi32> +// CHECK-VEC2: %[[zj:.*]] = zexti %[[lj]] : vector<16xi32> to vector<16xi64> // CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xf32> into vector<16xf32> -// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32> // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32> -// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> // CHECK-VEC2: } // CHECK-VEC2: } // CHECK-VEC2: return diff --git a/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir b/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir --- a/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir +++ b/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir 
@@ -66,7 +66,6 @@ func private @getTensorFilename(index) -> (!Filename) func private @newSparseTensor(!Filename, memref, index, index, index) -> (!SparseTensor) func private @delSparseTensor(!SparseTensor) -> () - func private @print_memref_f32(%ptr : tensor<*xf32>) // // Main driver that reads matrix from file and calls the sparse kernel. @@ -86,8 +85,8 @@ %sparse = constant true memref.store %sparse, %annotations[%c0] : memref memref.store %sparse, %annotations[%c1] : memref - %i32 = constant 3 : index - %f32 = constant 1 : index + %i32 = constant 2 : index + %f32 = constant 2 : index // Setup memory for the dense matrices and initialize. %adata = memref.alloc(%c5, %c10) : memref diff --git a/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir b/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir --- a/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir +++ b/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir @@ -58,7 +58,6 @@ func private @getTensorFilename(index) -> (!Filename) func private @newSparseTensor(!Filename, memref, index, index, index) -> (!SparseTensor) func private @delSparseTensor(!SparseTensor) -> () - func private @print_memref_f64(%ptr : tensor<*xf64>) // // Main driver that reads matrix from file and calls the sparse kernel. @@ -76,8 +75,8 @@ %sparse = constant true memref.store %sparse, %annotations[%c0] : memref memref.store %sparse, %annotations[%c1] : memref - %i64 = constant 2 : index - %f64 = constant 0 : index + %i64 = constant 1 : index + %f64 = constant 1 : index // Setup memory for a single reduction scalar, // initialized to zero. diff --git a/mlir/test/Integration/Sparse/sparse_matvec.mlir b/mlir/test/Integration/Sparse/sparse_matvec.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Sparse/sparse_matvec.mlir @@ -0,0 +1,140 @@ +// RUN: mlir-opt %s \ +// RUN: --test-sparsification="lower ptr-type=4 ind-type=4" \ +// RUN: --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \ +// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \ +// RUN: --std-bufferize --finalizing-bufferize \ +// RUN: --convert-vector-to-llvm --convert-std-to-llvm | \ +// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s +// +// RUN: mlir-opt %s \ +// RUN: --test-sparsification="lower vectorization-strategy=2 ptr-type=4 ind-type=4 vl=16" \ +// RUN: --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \ +// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \ +// RUN: --std-bufferize --finalizing-bufferize \ +// RUN: --convert-vector-to-llvm --convert-std-to-llvm | \ +// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// +// Use descriptive names for opaque pointers. 
+// +!Filename = type !llvm.ptr +!SparseTensor = type !llvm.ptr + +#matvec = { + indexing_maps = [ + affine_map<(i,j) -> (i,j)>, // A + affine_map<(i,j) -> (j)>, // b + affine_map<(i,j) -> (i)> // x (out) + ], + sparse = [ + [ "D", "S" ], // A + [ "D" ], // b + [ "D" ] // x + ], + iterator_types = ["parallel", "reduction"], + doc = "X(i) += A(i,j) * B(j)" +} + +// +// Integration test that lowers a kernel annotated as sparse to +// actual sparse code, initializes a matching sparse storage scheme +// from file, and runs the resulting code with the JIT compiler. +// +module { + // + // The kernel expressed as an annotated Linalg op. The kernel multiplies + // a sparse matrix A with a dense vector b into a dense vector x. + // + func @kernel_matvec(%argA: !SparseTensor, + %argb: tensor, + %argx: tensor) -> tensor { + %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor + %0 = linalg.generic #matvec + ins(%arga, %argb: tensor, tensor) + outs(%argx: tensor) { + ^bb(%a: f32, %b: f32, %x: f32): + %0 = mulf %a, %b : f32 + %1 = addf %x, %0 : f32 + linalg.yield %1 : f32 + } -> tensor + return %0 : tensor + } + + // + // Runtime support library that is called directly from here. + // + func private @getTensorFilename(index) -> (!Filename) + func private @newSparseTensor(!Filename, memref, index, index, index) -> (!SparseTensor) + func private @delSparseTensor(!SparseTensor) -> () + + // + // Main driver that reads matrix from file and calls the sparse kernel. + // + func @entry() { + %f0 = constant 0.0 : f32 + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c4 = constant 4 : index + %c256 = constant 256 : index + + // Mark inner dimension of the matrix as sparse and encode the + // storage scheme types (this must match the metadata in the + // alias above and compiler switches). In this case, we test + // that 8-bit indices and pointers work correctly. + %annotations = memref.alloc(%c2) : memref + %sparse = constant true + %dense = constant false + memref.store %dense, %annotations[%c0] : memref + memref.store %sparse, %annotations[%c1] : memref + %u8 = constant 4 : index + %f32 = constant 2 : index + + // Read the sparse matrix from file, construct sparse storage. + %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename) + %a = call @newSparseTensor(%fileName, %annotations, %u8, %u8, %f32) + : (!Filename, memref, index, index, index) -> (!SparseTensor) + + // Initialize dense vectors. + %bdata = memref.alloc(%c256) : memref + %xdata = memref.alloc(%c4) : memref + scf.for %i = %c0 to %c256 step %c1 { + %k = addi %i, %c1 : index + %l = index_cast %k : index to i32 + %f = sitofp %l : i32 to f32 + memref.store %f, %bdata[%i] : memref + } + scf.for %i = %c0 to %c4 step %c1 { + memref.store %f0, %xdata[%i] : memref + } + %b = memref.tensor_load %bdata : memref + %x = memref.tensor_load %xdata : memref + + // Call kernel. + %0 = call @kernel_matvec(%a, %b, %x) + : (!SparseTensor, tensor, tensor) -> tensor + + // Print the result for verification. + // + // CHECK: ( 1659, 1534, 21, 18315 ) + // + %m = memref.buffer_cast %0 : memref + %v = vector.transfer_read %m[%c0], %f0: memref, vector<4xf32> + vector.print %v : vector<4xf32> + + // Release the resources. 
+ call @delSparseTensor(%a) : (!SparseTensor) -> () + memref.dealloc %bdata : memref + memref.dealloc %xdata : memref + + return + } +} diff --git a/mlir/test/Integration/data/wide.mtx b/mlir/test/Integration/data/wide.mtx new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/data/wide.mtx @@ -0,0 +1,23 @@ +%%MatrixMarket matrix coordinate real general +% +% This is a test sparse matrix in Matrix Market Exchange Format. +% see https://math.nist.gov/MatrixMarket +% +4 256 17 +1 1 1.0 +1 127 2.0 +1 128 3.0 +1 255 4.0 +2 2 5.0 +2 254 6.0 +3 3 7.0 +4 1 8.0 +4 2 9.0 +4 4 10.0 +4 99 11.0 +4 127 12.0 +4 128 13.0 +4 129 14.0 +4 250 15.0 +4 254 16.0 +4 256 17.0
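
A note on the zero-extension rationale in genLoad: the overhead pointers and indices are stored in unsigned containers (uint8_t, uint16_t, uint32_t, uint64_t), so widening a narrow index with a sign extension would turn any value with its top bit set into a negative offset, whereas a zero extension preserves the stored value. The standalone C++ sketch below is illustration only, not part of the patch or of the runtime library; it shows the difference for an 8-bit index.

// Illustration only: why narrow sparse-overhead indices must be zero
// extended, not sign extended, before they participate in 64-bit address
// arithmetic (compare with the ZeroExtendIOp emitted by genLoad above).
#include <cassert>
#include <cstdint>
#include <cstdio>

int64_t widenWithSignExtension(uint8_t idx) {
  // Reinterprets the 8-bit storage as signed before widening.
  return static_cast<int64_t>(static_cast<int8_t>(idx));
}

int64_t widenWithZeroExtension(uint8_t idx) {
  // Widening directly from the unsigned source type is a zero extension.
  return static_cast<int64_t>(idx);
}

int main() {
  const uint8_t idx = 200; // a valid column index for 8-bit overhead storage
  printf("sign extended: %lld\n", (long long)widenWithSignExtension(idx)); // -56
  printf("zero extended: %lld\n", (long long)widenWithZeroExtension(idx)); // 200
  assert(widenWithZeroExtension(idx) == 200);
  return 0;
}

This is also why the scalar path extends narrow loads to i64 first and only then applies index_cast, which is exactly what the new zexti lines in the sparse_storage.mlir and sparse_vector.mlir CHECK patterns verify.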
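
The SparseTensorStorageBase additions follow a fail-by-default overload pattern: the base class declares one virtual getPointers/getIndices overload per supported overhead width, each ending in fatal(), and the templated storage class overrides only the overloads that match its own pointer and index types. The reduced standalone sketch below is illustration only; StorageBase and Storage are hypothetical names, not the runtime library's API.

// Illustration only: fail-by-default virtual overloads, reduced to two
// pointer widths; hypothetical names, not the MLIR runtime API.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

class StorageBase {
public:
  virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
  virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
  virtual ~StorageBase() {}

private:
  static void fatal(const char *tp) {
    fprintf(stderr, "unsupported: %s\n", tp);
    exit(1);
  }
};

// A storage class parameterized by its pointer width overrides exactly the
// matching overload; requests for any other width fall through to fatal().
template <typename P>
class Storage : public StorageBase {
public:
  Storage() { pointers.push_back({0, 2, 5}); }
  void getPointers(std::vector<P> **out, uint64_t d) override {
    *out = &pointers[d];
  }

private:
  std::vector<std::vector<P>> pointers;
};

int main() {
  Storage<uint16_t> storage; // 16-bit pointer overhead
  StorageBase *base = &storage;
  std::vector<uint16_t> *p = nullptr;
  base->getPointers(&p, 0); // dispatches to the 16-bit override
  printf("%d %d %d\n", (int)(*p)[0], (int)(*p)[1], (int)(*p)[2]); // 0 2 5
  // std::vector<uint64_t> *q = nullptr;
  // base->getPointers(&q, 0); // would hit the fatal("p64") default
  return 0;
}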
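
Similarly, the CASE macro in newSparseTensor maps a runtime (ptrTp, indTp, valTp) triple onto one compile-time instantiation. A condensed standalone sketch of that dispatch idea follows; it is illustration only, with a hypothetical makeStorage standing in for the real newSparseTensor template.

// Illustration only: runtime-enum to template-instantiation dispatch in the
// style of the CASE macro; makeStorage is a hypothetical stand-in.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };

template <typename P, typename I, typename V>
void *makeStorage() {
  printf("pointers: %u bits, indices: %u bits, values: %u bits\n",
         (unsigned)(sizeof(P) * 8), (unsigned)(sizeof(I) * 8),
         (unsigned)(sizeof(V) * 8));
  return nullptr;
}

#define CASE(p, i, v, P, I, V)                                                 \
  if (ptrTp == (p) && indTp == (i) && valTp == (v))                            \
    return makeStorage<P, I, V>()

void *dispatch(uint64_t ptrTp, uint64_t indTp, uint64_t valTp) {
  CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
  CASE(kU32, kU32, kF32, uint32_t, uint32_t, float);
  CASE(kU8, kU8, kF32, uint8_t, uint8_t, float); // low-overhead storage
  fputs("unsupported combination of types\n", stderr);
  exit(1);
}
#undef CASE

int main() {
  dispatch(kU8, kU8, kF32); // pointers: 8 bits, indices: 8 bits, values: 32 bits
  return 0;
}

The constants used by the new sparse_matvec.mlir driver line up with these enums: %u8 = constant 4 : index corresponds to kU8 = 4 and %f32 = constant 2 : index corresponds to kF32 = 2.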