diff --git a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
@@ -76,6 +76,10 @@
     name = "sparsePointers64";
   else if (eltType.isInteger(32))
     name = "sparsePointers32";
+  else if (eltType.isInteger(16))
+    name = "sparsePointers16";
+  else if (eltType.isInteger(8))
+    name = "sparsePointers8";
   else
     return failure();
   rewriter.replaceOpWithNewOp<CallOp>(
@@ -100,6 +104,10 @@
     name = "sparseIndices64";
   else if (eltType.isInteger(32))
     name = "sparseIndices32";
+  else if (eltType.isInteger(16))
+    name = "sparseIndices16";
+  else if (eltType.isInteger(8))
+    name = "sparseIndices8";
   else
     return failure();
   rewriter.replaceOpWithNewOp<CallOp>(
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -614,18 +614,21 @@
   }
 }
 
+/// Constructs vector type.
+static VectorType vectorType(CodeGen &codegen, Type etp) {
+  return VectorType::get(codegen.curVecLength, etp);
+}
+
 /// Constructs vector type from pointer.
 static VectorType vectorType(CodeGen &codegen, Value ptr) {
-  Type etp = ptr.getType().cast<MemRefType>().getElementType();
-  return VectorType::get(codegen.curVecLength, etp);
+  return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
 }
 
 /// Constructs vector iteration mask.
 static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter,
                            Value iv, Value lo, Value hi, Value step) {
   Location loc = iv.getLoc();
-  VectorType mtp =
-      VectorType::get(codegen.curVecLength, rewriter.getIntegerType(1));
+  VectorType mtp = vectorType(codegen, rewriter.getIntegerType(1));
   // Special case if the vector length evenly divides the trip count (for
   // example, "for i = 0, 128, 16"). A constant all-true mask is generated
   // so that all subsequent masked memory operations are immediately folded
@@ -683,7 +686,7 @@
 /// optimizations to hoist the invariant broadcast out of the vector loop.
 static Value genVectorInvariantValue(CodeGen &codegen,
                                      PatternRewriter &rewriter, Value val) {
-  VectorType vtp = VectorType::get(codegen.curVecLength, val.getType());
+  VectorType vtp = vectorType(codegen, val.getType());
   return rewriter.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
 }
 
@@ -747,15 +750,47 @@
   rewriter.create<memref::StoreOp>(loc, rhs, ptr, args);
 }
 
-/// Generates a pointer/index load from the sparse storage scheme.
+/// Generates a pointer/index load from the sparse storage scheme. Narrower
+/// data types need to be zero extended before casting the value into the
+/// index type used for looping and indexing.
 static Value genLoad(CodeGen &codegen, PatternRewriter &rewriter, Location loc,
                      Value ptr, Value s) {
-  if (codegen.curVecLength > 1)
-    return genVectorLoad(codegen, rewriter, ptr, {s});
+  // See https://llvm.org/docs/GetElementPtr.html for some background on
+  // the complications described below.
+  if (codegen.curVecLength > 1) {
+    // Since the index vector is used in subsequent gather/scatter operations,
+    // which effectively define an unsigned pointer + signed index, we must
+    // zero extend the vector to an index width. For 8-bit and 16-bit values,
+    // a 32-bit index width suffices. For 32-bit values, zero extending the
+    // elements into 64-bit loses some performance, since the 32-bit indexed
+    // gather/scatter is more efficient than the 64-bit index variant (in
+    // the future, we could introduce a flag that states the negative space
+    // of 32-bit indices is unused). For 64-bit values, there is no good way
+    // to state that the indices are unsigned, which creates the potential
+    // for incorrect address calculations in the unlikely case we need such
+    // extremely large offsets.
+    Type etp = ptr.getType().cast<MemRefType>().getElementType();
+    Value vload = genVectorLoad(codegen, rewriter, ptr, {s});
+    if (etp.getIntOrFloatBitWidth() < 32)
+      vload = rewriter.create<ZeroExtendIOp>(
+          loc, vload, vectorType(codegen, rewriter.getIntegerType(32)));
+    else if (etp.getIntOrFloatBitWidth() < 64)
+      vload = rewriter.create<ZeroExtendIOp>(
+          loc, vload, vectorType(codegen, rewriter.getIntegerType(64)));
+    return vload;
+  }
+  // For the scalar case, we simply zero extend narrower indices into 64-bit
+  // values before casting to index, without a performance penalty. Here too,
+  // however, indices that are already 64-bit cannot, in theory, express the
+  // full range, as explained above.
   Value load = rewriter.create<memref::LoadOp>(loc, ptr, s);
-  return load.getType().isa<IndexType>()
-             ? load
-             : rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+  if (!load.getType().isa<IndexType>()) {
+    if (load.getType().getIntOrFloatBitWidth() < 64)
+      load = rewriter.create<ZeroExtendIOp>(loc, load,
+                                            rewriter.getIntegerType(64));
+    load = rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+  }
+  return load;
 }
 
 /// Generates an invariant value.
@@ -959,8 +994,10 @@
     if (!merger.isSparseTensor(t) && !linkedSparse(op, t)) {
       auto map = op.getIndexingMap(t);
       unsigned r = map.getNumResults();
-      if (r && map.getDimPosition(r - 1) != idx)
-        return false;
+      for (unsigned i = 0; i < r; i++) {
+        if (map.getDimPosition(i) == idx && i != r - 1)
+          return false;
+      }
     }
   }
   return true;
diff --git a/mlir/lib/ExecutionEngine/SparseUtils.cpp b/mlir/lib/ExecutionEngine/SparseUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseUtils.cpp
@@ -113,12 +113,21 @@
 class SparseTensorStorageBase {
 public:
   virtual uint64_t getDimSize(uint64_t) = 0;
+
+  // Overhead storage.
   virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
   virtual void getPointers(std::vector<uint32_t> **, uint64_t) { fatal("p32"); }
+  virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
+  virtual void getPointers(std::vector<uint8_t> **, uint64_t) { fatal("p8"); }
   virtual void getIndices(std::vector<uint64_t> **, uint64_t) { fatal("i64"); }
   virtual void getIndices(std::vector<uint32_t> **, uint64_t) { fatal("i32"); }
+  virtual void getIndices(std::vector<uint16_t> **, uint64_t) { fatal("i16"); }
+  virtual void getIndices(std::vector<uint8_t> **, uint64_t) { fatal("i8"); }
+
+  // Primary storage.
   virtual void getValues(std::vector<double> **) { fatal("valf64"); }
   virtual void getValues(std::vector<float> **) { fatal("valf32"); }
+
   virtual ~SparseTensorStorageBase() {}
 
 private:
@@ -464,6 +473,22 @@
   uint64_t strides[1];
 };
 
+struct MemRef1DU16 {
+  const uint16_t *base;
+  const uint16_t *data;
+  uint64_t off;
+  uint64_t sizes[1];
+  uint64_t strides[1];
+};
+
+struct MemRef1DU8 {
+  const uint8_t *base;
+  const uint8_t *data;
+  uint64_t off;
+  uint64_t sizes[1];
+  uint64_t strides[1];
+};
+
 struct MemRef1DF64 {
   const double *base;
   const double *data;
   uint64_t off;
   uint64_t sizes[1];
   uint64_t strides[1];
 };
@@ -480,41 +505,42 @@
   uint64_t strides[1];
 };
 
-enum TypeEnum : uint64_t { kF64 = 0, kF32 = 1, kU64 = 2, kU32 = 3 };
+enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
+enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };
+
+#define CASE(p, i, v, P, I, V)                                                 \
+  if (ptrTp == (p) && indTp == (i) && valTp == (v))                            \
+    return newSparseTensor<P, I, V>(filename, sparsity, asize)
 
 void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
                       uint64_t asize, uint64_t astride, uint64_t ptrTp,
                       uint64_t indTp, uint64_t valTp) {
   assert(astride == 1);
   bool *sparsity = abase + aoff;
-  if (ptrTp == kU64 && indTp == kU64 && valTp == kF64)
-    return newSparseTensor<uint64_t, uint64_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU64 && indTp == kU64 && valTp == kF32)
-    return newSparseTensor<uint64_t, uint64_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU64 && indTp == kU32 && valTp == kF64)
-    return newSparseTensor<uint64_t, uint32_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU64 && indTp == kU32 && valTp == kF32)
-    return newSparseTensor<uint64_t, uint32_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU32 && indTp == kU64 && valTp == kF64)
-    return newSparseTensor<uint32_t, uint64_t, double>(filename, sparsity,
-                                                       asize);
-  if (ptrTp == kU32 && indTp == kU64 && valTp == kF32)
-    return newSparseTensor<uint32_t, uint64_t, float>(filename, sparsity,
-                                                      asize);
-  if (ptrTp == kU32 && indTp == kU32 && valTp == kF64)
-    return newSparseTensor<uint32_t, uint32_t, double>(filename, sparsity,
                                                        asize);
-  if (ptrTp == kU32 && indTp == kU32 && valTp == kF32)
-    return newSparseTensor<uint32_t, uint32_t, float>(filename, sparsity,
-                                                      asize);
+
+  // The most common cases: 64-bit or 32-bit overhead, double/float values.
+  CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
+  CASE(kU64, kU64, kF32, uint64_t, uint64_t, float);
+  CASE(kU64, kU32, kF64, uint64_t, uint32_t, double);
+  CASE(kU64, kU32, kF32, uint64_t, uint32_t, float);
+  CASE(kU32, kU64, kF64, uint32_t, uint64_t, double);
+  CASE(kU32, kU64, kF32, uint32_t, uint64_t, float);
+  CASE(kU32, kU32, kF64, uint32_t, uint32_t, double);
+  CASE(kU32, kU32, kF32, uint32_t, uint32_t, float);
+
+  // Some special cases: low overhead storage, double/float values.
+  CASE(kU16, kU16, kF64, uint16_t, uint16_t, double);
+  CASE(kU8, kU8, kF64, uint8_t, uint8_t, double);
+  CASE(kU16, kU16, kF32, uint16_t, uint16_t, float);
+  CASE(kU8, kU8, kF32, uint8_t, uint8_t, float);
+
+  // Unsupported case (add above if needed).
fputs("unsupported combination of types\n", stderr); exit(1); } +#undef CASE + uint64_t sparseDimSize(void *tensor, uint64_t d) { return static_cast(tensor)->getDimSize(d); } @@ -531,6 +557,18 @@ return {v->data(), v->data(), 0, {v->size()}, {1}}; } +MemRef1DU16 sparsePointers16(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getPointers(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + +MemRef1DU8 sparsePointers8(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getPointers(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + MemRef1DU64 sparseIndices64(void *tensor, uint64_t d) { std::vector *v; static_cast(tensor)->getIndices(&v, d); @@ -543,6 +581,18 @@ return {v->data(), v->data(), 0, {v->size()}, {1}}; } +MemRef1DU16 sparseIndices16(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getIndices(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + +MemRef1DU8 sparseIndices8(void *tensor, uint64_t d) { + std::vector *v; + static_cast(tensor)->getIndices(&v, d); + return {v->data(), v->data(), 0, {v->size()}, {1}}; +} + MemRef1DF64 sparseValuesF64(void *tensor) { std::vector *v; static_cast(tensor)->getValues(&v); diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -36,6 +36,7 @@ # Copy test data over. file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.mtx ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.tns + ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/wide.mtx DESTINATION ${MLIR_INTEGRATION_TEST_DIR}/data/) endif() diff --git a/mlir/test/Dialect/Linalg/sparse_storage.mlir b/mlir/test/Dialect/Linalg/sparse_storage.mlir --- a/mlir/test/Dialect/Linalg/sparse_storage.mlir +++ b/mlir/test/Dialect/Linalg/sparse_storage.mlir @@ -51,7 +51,8 @@ // CHECK-TYPE1: %[[B1:.*]] = index_cast %[[P1]] : i64 to index // CHECK-TYPE1: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE1: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE1: %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index +// CHECK-TYPE1: %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64 +// CHECK-TYPE1: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE1: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE1: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE1: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 @@ -62,9 +63,11 @@ // CHECK-TYPE2: %[[C0:.*]] = constant 0 : index // CHECK-TYPE2: %[[C1:.*]] = constant 1 : index // CHECK-TYPE2: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[P0]] : i32 to index +// CHECK-TYPE2: %[[Z0:.*]] = zexti %[[P0]] : i32 to i64 +// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE2: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[P1]] : i32 to index +// CHECK-TYPE2: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64 +// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE2: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE2: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE2: %[[INDC:.*]] = index_cast %[[IND0]] : i64 to index @@ -78,12 +81,15 @@ // CHECK-TYPE3: %[[C0:.*]] = constant 0 : index // CHECK-TYPE3: %[[C1:.*]] = constant 1 : index // CHECK-TYPE3: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[P0]] : i32 to index +// CHECK-TYPE3: 
%[[Z0:.*]] = zexti %[[P0]] : i32 to i64 +// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE3: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[P1]] : i32 to index +// CHECK-TYPE3: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64 +// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE3: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE3: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE3: %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index +// CHECK-TYPE3: %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64 +// CHECK-TYPE3: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE3: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE3: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE3: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 @@ -94,12 +100,15 @@ // CHECK-TYPE4: %[[C0:.*]] = constant 0 : index // CHECK-TYPE4: %[[C1:.*]] = constant 1 : index // CHECK-TYPE4: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[P0]] : i16 to index +// CHECK-TYPE4: %[[Z0:.*]] = zexti %[[P0]] : i16 to i64 +// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE4: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[P1]] : i16 to index +// CHECK-TYPE4: %[[Z1:.*]] = zexti %[[P1]] : i16 to i64 +// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE4: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE4: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE4: %[[INDC:.*]] = index_cast %[[IND0]] : i16 to index +// CHECK-TYPE4: %[[ZEXT:.*]] = zexti %[[IND0]] : i16 to i64 +// CHECK-TYPE4: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE4: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE4: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE4: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 @@ -110,12 +119,15 @@ // CHECK-TYPE5: %[[C0:.*]] = constant 0 : index // CHECK-TYPE5: %[[C1:.*]] = constant 1 : index // CHECK-TYPE5: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref -// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[P0]] : i8 to index +// CHECK-TYPE5: %[[Z0:.*]] = zexti %[[P0]] : i8 to i64 +// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index // CHECK-TYPE5: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref -// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[P1]] : i8 to index +// CHECK-TYPE5: %[[Z1:.*]] = zexti %[[P1]] : i8 to i64 +// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index // CHECK-TYPE5: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] { // CHECK-TYPE5: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref -// CHECK-TYPE5: %[[INDC:.*]] = index_cast %[[IND0]] : i8 to index +// CHECK-TYPE5: %[[ZEXT:.*]] = zexti %[[IND0]] : i8 to i64 +// CHECK-TYPE5: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index // CHECK-TYPE5: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref // CHECK-TYPE5: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64> // CHECK-TYPE5: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64 diff --git a/mlir/test/Dialect/Linalg/sparse_vector.mlir b/mlir/test/Dialect/Linalg/sparse_vector.mlir --- a/mlir/test/Dialect/Linalg/sparse_vector.mlir +++ b/mlir/test/Dialect/Linalg/sparse_vector.mlir @@ -85,12 +85,15 @@ // CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index // CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index // 
CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC0: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC0: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC0: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC0: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC0: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC0: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC0: %[[ci:.*]] = index_cast %[[li]] : i32 to index +// CHECK-VEC0: %[[zi:.*]] = zexti %[[li]] : i32 to i64 +// CHECK-VEC0: %[[ci:.*]] = index_cast %[[zi]] : i64 to index // CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref // CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32> // CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32 @@ -102,12 +105,15 @@ // CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index // CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index // CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC1: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC1: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC1: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC1: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC1: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC1: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC1: %[[ci:.*]] = index_cast %[[li]] : i32 to index +// CHECK-VEC1: %[[zi:.*]] = zexti %[[li]] : i32 to i64 +// CHECK-VEC1: %[[ci:.*]] = index_cast %[[zi]] : i64 to index // CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref // CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32> // CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32 @@ -120,17 +126,20 @@ // CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index // CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index // CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] { // CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index // CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> // CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xi32> into vector<16xi32> +// CHECK-VEC2: %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64> // CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xf32> into vector<16xf32> -// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : 
memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32> // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32> -// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> // CHECK-VEC2: } // CHECK-VEC2: return // @@ -151,17 +160,20 @@ // CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index // CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index // CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref -// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref -// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] { // CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index // CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> // CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xi32> into vector<16xi32> +// CHECK-VEC2: %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64> // CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xf32> into vector<16xf32> -// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32> // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32> -// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> // CHECK-VEC2: } // CHECK-VEC2: return // @@ -303,13 +315,16 @@ // CHECK-VEC0-DAG: %[[c512:.*]] = constant 512 : index // CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] { // CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC0: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC0: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC0: %[[a:.*]] = addi %[[i]], %[[c1]] : index // CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref -// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC0: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC0: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC0: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC0: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref -// CHECK-VEC0: %[[cj:.*]] = index_cast %[[lj]] : i32 to index +// CHECK-VEC0: %[[zj:.*]] = zexti %[[lj]] : i32 to i64 +// CHECK-VEC0: %[[cj:.*]] = index_cast %[[zj]] : i64 to index // CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref // CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32> // CHECK-VEC0: %[[m:.*]] = 
mulf %[[la]], %[[lb]] : f32 @@ -324,13 +339,16 @@ // CHECK-VEC1-DAG: %[[c512:.*]] = constant 512 : index // CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] { // CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC1: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC1: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC1: %[[a:.*]] = addi %[[i]], %[[c1]] : index // CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref -// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC1: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC1: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC1: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] { // CHECK-VEC1: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref -// CHECK-VEC1: %[[cj:.*]] = index_cast %[[lj]] : i32 to index +// CHECK-VEC1: %[[zj:.*]] = zexti %[[lj]] : i32 to i64 +// CHECK-VEC1: %[[cj:.*]] = index_cast %[[zj]] : i64 to index // CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref // CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32> // CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32 @@ -346,18 +364,21 @@ // CHECK-VEC2-DAG: %[[c512:.*]] = constant 512 : index // CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] { // CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref -// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index +// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64 +// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index // CHECK-VEC2: %[[a:.*]] = addi %[[i]], %[[c1]] : index // CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref -// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index +// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64 +// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index // CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] { // CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[j]] : index // CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1> // CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xi32> into vector<16xi32> +// CHECK-VEC2: %[[zj:.*]] = zexti %[[lj]] : vector<16xi32> to vector<16xi64> // CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref, vector<16xi1>, vector<16xf32> into vector<16xf32> -// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> +// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32> // CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32> -// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> +// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> // CHECK-VEC2: } // CHECK-VEC2: } // CHECK-VEC2: return diff --git a/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir b/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir --- a/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir +++ b/mlir/test/Integration/Sparse/CPU/sparse_sampled_matmul.mlir 
@@ -66,7 +66,6 @@ func private @getTensorFilename(index) -> (!Filename) func private @newSparseTensor(!Filename, memref, index, index, index) -> (!SparseTensor) func private @delSparseTensor(!SparseTensor) -> () - func private @print_memref_f32(%ptr : tensor<*xf32>) // // Main driver that reads matrix from file and calls the sparse kernel. @@ -86,8 +85,8 @@ %sparse = constant true memref.store %sparse, %annotations[%c0] : memref memref.store %sparse, %annotations[%c1] : memref - %i32 = constant 3 : index - %f32 = constant 1 : index + %i32 = constant 2 : index + %f32 = constant 2 : index // Setup memory for the dense matrices and initialize. %adata = memref.alloc(%c5, %c10) : memref diff --git a/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir b/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir --- a/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir +++ b/mlir/test/Integration/Sparse/CPU/sparse_sum.mlir @@ -58,7 +58,6 @@ func private @getTensorFilename(index) -> (!Filename) func private @newSparseTensor(!Filename, memref, index, index, index) -> (!SparseTensor) func private @delSparseTensor(!SparseTensor) -> () - func private @print_memref_f64(%ptr : tensor<*xf64>) // // Main driver that reads matrix from file and calls the sparse kernel. @@ -76,8 +75,8 @@ %sparse = constant true memref.store %sparse, %annotations[%c0] : memref memref.store %sparse, %annotations[%c1] : memref - %i64 = constant 2 : index - %f64 = constant 0 : index + %i64 = constant 1 : index + %f64 = constant 1 : index // Setup memory for a single reduction scalar, // initialized to zero. diff --git a/mlir/test/Integration/Sparse/sparse_matvec.mlir b/mlir/test/Integration/Sparse/sparse_matvec.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Sparse/sparse_matvec.mlir @@ -0,0 +1,140 @@ +// RUN: mlir-opt %s \ +// RUN: --test-sparsification="lower ptr-type=4 ind-type=4" \ +// RUN: --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \ +// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \ +// RUN: --std-bufferize --finalizing-bufferize \ +// RUN: --convert-vector-to-llvm --convert-std-to-llvm | \ +// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s +// +// RUN: mlir-opt %s \ +// RUN: --test-sparsification="lower vectorization-strategy=2 ptr-type=4 ind-type=4 vl=16" \ +// RUN: --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \ +// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \ +// RUN: --std-bufferize --finalizing-bufferize \ +// RUN: --convert-vector-to-llvm --convert-std-to-llvm | \ +// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \ +// RUN: mlir-cpu-runner \ +// RUN: -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// +// Use descriptive names for opaque pointers. 
+// +!Filename = type !llvm.ptr +!SparseTensor = type !llvm.ptr + +#matvec = { + indexing_maps = [ + affine_map<(i,j) -> (i,j)>, // A + affine_map<(i,j) -> (j)>, // b + affine_map<(i,j) -> (i)> // x (out) + ], + sparse = [ + [ "D", "S" ], // A + [ "D" ], // b + [ "D" ] // x + ], + iterator_types = ["parallel", "reduction"], + doc = "X(i) += A(i,j) * B(j)" +} + +// +// Integration test that lowers a kernel annotated as sparse to +// actual sparse code, initializes a matching sparse storage scheme +// from file, and runs the resulting code with the JIT compiler. +// +module { + // + // The kernel expressed as an annotated Linalg op. The kernel multiplies + // a sparse matrix A with a dense vector b into a dense vector x. + // + func @kernel_matvec(%argA: !SparseTensor, + %argb: tensor, + %argx: tensor) -> tensor { + %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor + %0 = linalg.generic #matvec + ins(%arga, %argb: tensor, tensor) + outs(%argx: tensor) { + ^bb(%a: f32, %b: f32, %x: f32): + %0 = mulf %a, %b : f32 + %1 = addf %x, %0 : f32 + linalg.yield %1 : f32 + } -> tensor + return %0 : tensor + } + + // + // Runtime support library that is called directly from here. + // + func private @getTensorFilename(index) -> (!Filename) + func private @newSparseTensor(!Filename, memref, index, index, index) -> (!SparseTensor) + func private @delSparseTensor(!SparseTensor) -> () + + // + // Main driver that reads matrix from file and calls the sparse kernel. + // + func @entry() { + %f0 = constant 0.0 : f32 + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c4 = constant 4 : index + %c256 = constant 256 : index + + // Mark inner dimension of the matrix as sparse and encode the + // storage scheme types (this must match the metadata in the + // alias above and compiler switches). In this case, we test + // that 8-bit indices and pointers work correctly. + %annotations = memref.alloc(%c2) : memref + %sparse = constant true + %dense = constant false + memref.store %dense, %annotations[%c0] : memref + memref.store %sparse, %annotations[%c1] : memref + %u8 = constant 4 : index + %f32 = constant 2 : index + + // Read the sparse matrix from file, construct sparse storage. + %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename) + %a = call @newSparseTensor(%fileName, %annotations, %u8, %u8, %f32) + : (!Filename, memref, index, index, index) -> (!SparseTensor) + + // Initialize dense vectors. + %bdata = memref.alloc(%c256) : memref + %xdata = memref.alloc(%c4) : memref + scf.for %i = %c0 to %c256 step %c1 { + %k = addi %i, %c1 : index + %l = index_cast %k : index to i32 + %f = sitofp %l : i32 to f32 + memref.store %f, %bdata[%i] : memref + } + scf.for %i = %c0 to %c4 step %c1 { + memref.store %f0, %xdata[%i] : memref + } + %b = memref.tensor_load %bdata : memref + %x = memref.tensor_load %xdata : memref + + // Call kernel. + %0 = call @kernel_matvec(%a, %b, %x) + : (!SparseTensor, tensor, tensor) -> tensor + + // Print the result for verification. + // + // CHECK: ( 1659, 1534, 21, 18315 ) + // + %m = memref.buffer_cast %0 : memref + %v = vector.transfer_read %m[%c0], %f0: memref, vector<4xf32> + vector.print %v : vector<4xf32> + + // Release the resources. 
+ call @delSparseTensor(%a) : (!SparseTensor) -> () + memref.dealloc %bdata : memref + memref.dealloc %xdata : memref + + return + } +} diff --git a/mlir/test/Integration/data/wide.mtx b/mlir/test/Integration/data/wide.mtx new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/data/wide.mtx @@ -0,0 +1,23 @@ +%%MatrixMarket matrix coordinate real general +% +% This is a test sparse matrix in Matrix Market Exchange Format. +% see https://math.nist.gov/MatrixMarket +% +4 256 17 +1 1 1.0 +1 127 2.0 +1 128 3.0 +1 255 4.0 +2 2 5.0 +2 254 6.0 +3 3 7.0 +4 1 8.0 +4 2 9.0 +4 4 10.0 +4 99 11.0 +4 127 12.0 +4 128 13.0 +4 129 14.0 +4 250 15.0 +4 254 16.0 +4 256 17.0
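
A note on the zero-extension rationale in genLoad: the overhead pointers and indices are stored in unsigned containers (uint8_t, uint16_t, uint32_t, uint64_t), so widening a narrow index with a sign extension would turn any value with its top bit set into a negative offset, whereas a zero extension preserves the stored value. The standalone C++ sketch below is illustration only, not part of the patch or of the runtime library; it shows the difference for an 8-bit index.

// Illustration only: why narrow sparse-overhead indices must be zero
// extended, not sign extended, before they participate in 64-bit address
// arithmetic (compare with the ZeroExtendIOp emitted by genLoad above).
#include <cassert>
#include <cstdint>
#include <cstdio>

int64_t widenWithSignExtension(uint8_t idx) {
  // Reinterprets the 8-bit storage as signed before widening.
  return static_cast<int64_t>(static_cast<int8_t>(idx));
}

int64_t widenWithZeroExtension(uint8_t idx) {
  // Widening directly from the unsigned source type is a zero extension.
  return static_cast<int64_t>(idx);
}

int main() {
  const uint8_t idx = 200; // a valid column index for 8-bit overhead storage
  printf("sign extended: %lld\n", (long long)widenWithSignExtension(idx)); // -56
  printf("zero extended: %lld\n", (long long)widenWithZeroExtension(idx)); // 200
  assert(widenWithZeroExtension(idx) == 200);
  return 0;
}

This is also why the scalar path extends narrow loads to i64 first and only then applies index_cast, which is exactly what the new zexti lines in the sparse_storage.mlir and sparse_vector.mlir CHECK patterns verify.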
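
The SparseTensorStorageBase additions follow a fail-by-default overload pattern: the base class declares one virtual getPointers/getIndices overload per supported overhead width, each ending in fatal(), and the templated storage class overrides only the overloads that match its own pointer and index types. The reduced standalone sketch below is illustration only; StorageBase and Storage are hypothetical names, not the runtime library's API.

// Illustration only: fail-by-default virtual overloads, reduced to two
// pointer widths; hypothetical names, not the MLIR runtime API.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

class StorageBase {
public:
  virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
  virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
  virtual ~StorageBase() {}

private:
  static void fatal(const char *tp) {
    fprintf(stderr, "unsupported: %s\n", tp);
    exit(1);
  }
};

// A storage class parameterized by its pointer width overrides exactly the
// matching overload; requests for any other width fall through to fatal().
template <typename P>
class Storage : public StorageBase {
public:
  Storage() { pointers.push_back({0, 2, 5}); }
  void getPointers(std::vector<P> **out, uint64_t d) override {
    *out = &pointers[d];
  }

private:
  std::vector<std::vector<P>> pointers;
};

int main() {
  Storage<uint16_t> storage; // 16-bit pointer overhead
  StorageBase *base = &storage;
  std::vector<uint16_t> *p = nullptr;
  base->getPointers(&p, 0); // dispatches to the 16-bit override
  printf("%d %d %d\n", (int)(*p)[0], (int)(*p)[1], (int)(*p)[2]); // 0 2 5
  // std::vector<uint64_t> *q = nullptr;
  // base->getPointers(&q, 0); // would hit the fatal("p64") default
  return 0;
}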
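
Similarly, the CASE macro in newSparseTensor maps a runtime (ptrTp, indTp, valTp) triple onto one compile-time instantiation. A condensed standalone sketch of that dispatch idea follows; it is illustration only, with a hypothetical makeStorage standing in for the real newSparseTensor template.

// Illustration only: runtime-enum to template-instantiation dispatch in the
// style of the CASE macro; makeStorage is a hypothetical stand-in.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };

template <typename P, typename I, typename V>
void *makeStorage() {
  printf("pointers: %u bits, indices: %u bits, values: %u bits\n",
         (unsigned)(sizeof(P) * 8), (unsigned)(sizeof(I) * 8),
         (unsigned)(sizeof(V) * 8));
  return nullptr;
}

#define CASE(p, i, v, P, I, V)                                                 \
  if (ptrTp == (p) && indTp == (i) && valTp == (v))                            \
    return makeStorage<P, I, V>()

void *dispatch(uint64_t ptrTp, uint64_t indTp, uint64_t valTp) {
  CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
  CASE(kU32, kU32, kF32, uint32_t, uint32_t, float);
  CASE(kU8, kU8, kF32, uint8_t, uint8_t, float); // low-overhead storage
  fputs("unsupported combination of types\n", stderr);
  exit(1);
}
#undef CASE

int main() {
  dispatch(kU8, kU8, kF32); // pointers: 8 bits, indices: 8 bits, values: 32 bits
  return 0;
}

The constants used by the new sparse_matvec.mlir driver line up with these enums: %u8 = constant 4 : index corresponds to kU8 = 4 and %f32 = constant 2 : index corresponds to kF32 = 2.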