diff --git a/mlir/benchmark/python/common.py b/mlir/benchmark/python/common.py
new file mode 100644
--- /dev/null
+++ b/mlir/benchmark/python/common.py
@@ -0,0 +1,141 @@
+import ctypes
+import numpy as np
+import os
+import re
+import time
+
+from mlir import ir
+from mlir import runtime as rt
+from mlir.dialects import builtin
+from mlir.dialects import arith
+from mlir.dialects import memref
+from mlir.dialects import scf
+from mlir.dialects import std
+from mlir.execution_engine import ExecutionEngine
+from mlir.passmanager import PassManager
+
+
+def create_random_np_tensor(tensor_type):
+  """Return an array of uniform random values shaped like `tensor_type`.
+
+  The shape is read from the textual form of the type, for example
+  `tensor<1000x1500xf64>`. Only the leading `<dim>x` groups are consumed, so
+  element types whose spelling contains an `x` (`index`, `complex<f32>`) and
+  trailing encoding attributes do not disturb the parse.
+
+  Args:
+    tensor_type: Object whose `str()` is the textual form of a tensor type.
+
+  Returns:
+    A numpy array with the parsed shape, filled by
+    `np.random.uniform(low=0.0, high=100.0)`.
+
+  Raises:
+    ValueError: If the text does not start with `tensor<`, or if an extent is
+      dynamic (`?`) or the tensor is unranked (`*`).
+  """
+  tensor_type_str = str(tensor_type)
+  shape_match = re.match(r"tensor<((?:[0-9?*]+x)*)", tensor_type_str)
+  if shape_match is None:
+    raise ValueError(f"not a tensor type: {tensor_type_str}")
+  # The captured text ends with an "x", hence the dropped empty last item.
+  # int() rejects "?" and "*", so non-static shapes still raise ValueError.
+  dimensions = [int(dim) for dim in shape_match.group(1).split("x")[:-1]]
+  return np.random.uniform(low=0.0, high=100.0, size=dimensions)
+
+
+def create_zero_np_tensor(tensor_type):
+  """Return a zero-filled float64 array shaped like `tensor_type`.
+
+  The shape is read from the textual form of the type, for example
+  `tensor<1000x2000xf64>`. Only the leading `<dim>x` groups are consumed, so
+  element types whose spelling contains an `x` (`index`, `complex<f32>`) and
+  trailing encoding attributes do not disturb the parse.
+
+  Args:
+    tensor_type: Object whose `str()` is the textual form of a tensor type.
+
+  Returns:
+    A `np.float64` array of zeros with the parsed shape.
+
+  Raises:
+    ValueError: If the text does not start with `tensor<`, or if an extent is
+      dynamic (`?`) or the tensor is unranked (`*`).
+  """
+  tensor_type_str = str(tensor_type)
+  shape_match = re.match(r"tensor<((?:[0-9?*]+x)*)", tensor_type_str)
+  if shape_match is None:
+    raise ValueError(f"not a tensor type: {tensor_type_str}")
+  # The captured text ends with an "x", hence the dropped empty last item.
+  # int() rejects "?" and "*", so non-static shapes still raise ValueError.
+  dimensions = [int(dim) for dim in shape_match.group(1).split("x")[:-1]]
+  return np.zeros(dimensions, np.float64)
+
+
+def construct_arguments_for_kernel_function(kernel_func):
+  """Build the ctypes argument list used to invoke `kernel_func`.
+
+  The list holds, in order: a zero-filled buffer shaped like the last input
+  type, a random buffer for every input type except the last, and a second
+  zero-filled buffer shaped like the last input type. Each buffer is wrapped
+  as a pointer to a pointer to its ranked memref descriptor.
+
+  NOTE(review): the leading zero buffer is presumably the slot that receives
+  the kernel's returned tensor, and the trailing one the output operand;
+  confirm against the C interface calling convention.
+
+  Args:
+    kernel_func: Function op whose `type.inputs` lists the operand types.
+
+  Returns:
+    List of ctypes pointers, one per buffer described above.
+  """
+  input_types = kernel_func.type.inputs
+  output_type = input_types[-1]
+
+  np_buffers = [create_zero_np_tensor(output_type)]
+  np_buffers.extend(
+    create_random_np_tensor(operand_type) for operand_type in input_types[:-1])
+  np_buffers.append(create_zero_np_tensor(output_type))
+
+  def as_memref_arg(np_buffer):
+    descriptor = rt.get_ranked_memref_descriptor(np_buffer)
+    return ctypes.pointer(ctypes.pointer(descriptor))
+
+  return [as_memref_arg(np_buffer) for np_buffer in np_buffers]
+
+
+def emit_timer_func() -> builtin.FuncOp:
+  """Create the private `nano_time` function op and return it.
+
+  The op takes no arguments, returns a single signless 64-bit integer and is
+  tagged with `llvm.emit_c_interface`. No body is added here.
+
+  NOTE(review): the definition of `nano_time` is presumably supplied by one
+  of the runner utility libraries loaded at execution time; confirm.
+  """
+  signature = ([], [ir.IntegerType.get_signless(64)])
+  timer_decl = builtin.FuncOp("nano_time", signature, visibility="private")
+  timer_decl.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+  return timer_decl
+
+
+def emit_benchmark_wrapped_main_func(
+  func: builtin.FuncOp,
+  timer_func: builtin.FuncOp
+) -> builtin.FuncOp:
+  """Emit a public `main` function that repeatedly calls and times `func`.
+
+  `main` takes the arguments of `func` followed by a 1-D memref of i64 and
+  returns the result types of `func`. Its body loops from 0 to the size of
+  that memref in steps of 1. Each iteration calls `timer_func`, calls `func`,
+  calls `timer_func` again, and stores the difference of the two timer values
+  into the memref at the loop index.
+
+  The last `len(func.type.results)` arguments of `main` before the timing
+  buffer seed the loop-carried values. Each iteration passes the current
+  loop-carried values to `func` in those positions and yields the call
+  results for the next iteration. The final loop results are returned.
+
+  Args:
+    func: Function to benchmark.
+    timer_func: Function called with no operands before and after `func`.
+
+  Returns:
+    The newly created `main` function op.
+  """
+  i64_type = ir.IntegerType.get_signless(64)
+  # NOTE(review): -1 presumably marks the single dimension as dynamic; its
+  # size is read at run time with memref.DimOp below.
+  memref_of_i64_type = ir.MemRefType.get([-1], i64_type)
+  wrapped_func = builtin.FuncOp(
+    # Same signature as `func`, plus a trailing i64 buffer to save timings.
+    "main",
+    (func.arguments.types + [memref_of_i64_type], func.type.results),
+    visibility="public")
+  wrapped_func.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+
+  num_results = len(func.type.results)
+  with ir.InsertionPoint(wrapped_func.add_entry_block()):
+    # The timing buffer is always the last argument of `main`.
+    timer_buffer = wrapped_func.arguments[-1]
+    zero = arith.ConstantOp.create_index(0)
+    # One loop iteration per slot in the timing buffer.
+    n_iterations = memref.DimOp(ir.IndexType.get(), timer_buffer, zero)
+    one = arith.ConstantOp.create_index(1)
+    # Arguments just before the timing buffer become the loop-carried values.
+    iter_args = list(wrapped_func.arguments[-num_results - 1:-1])
+    loop = scf.ForOp(zero, n_iterations, one, iter_args)
+    with ir.InsertionPoint(loop.body):
+      start = std.CallOp(timer_func, [])
+      # Leading arguments are forwarded unchanged; the trailing ones come from
+      # the loop-carried values.
+      call = std.CallOp(
+        func, wrapped_func.arguments[:-num_results - 1] + loop.inner_iter_args)
+      end = std.CallOp(timer_func, [])
+      # This local name shadows the `time` module inside this function only.
+      time = arith.SubIOp(end, start)
+      memref.StoreOp(time, timer_buffer, [loop.induction_variable])
+      scf.YieldOp(list(call.results))
+    std.ReturnOp(loop)
+
+  return wrapped_func
+
+
+def print_benchmark_report(times_array):
+  """Print summary statistics for a sequence of run times to stdout.
+
+  The report lists the number of runs, mean, median, minimum, the 1st, 10th,
+  25th, 50th, 75th, 90th and 99th percentiles, and the maximum.
+
+  Args:
+    times_array: Run times. Every statistic is printed with an "ms" suffix,
+      so callers are expected to pass milliseconds.
+  """
+  def percentile(rank):
+    return lambda samples: np.percentile(samples, rank)
+
+  # Each statistic is evaluated right before its own line is printed.
+  statistics = (
+    ("  Mean:           ", np.mean),
+    ("  Median:         ", np.median),
+    ("  Fastest:        ", np.amin),
+    ("  p1:             ", percentile(1)),
+    ("  p10:            ", percentile(10)),
+    ("  p25:            ", percentile(25)),
+    ("  p50:            ", percentile(50)),
+    ("  p75:            ", percentile(75)),
+    ("  p90:            ", percentile(90)),
+    ("  p99:            ", percentile(99)),
+    ("  Slowest:        ", np.amax),
+  )
+
+  print(f"  Number of runs: {len(times_array)}")
+  for prefix, compute in statistics:
+    print(f"{prefix}{compute(times_array)} ms")
+
+
+def benchmark(pipeline, number_of_runs):
+  """Return a decorator that compiles, runs and reports on a kernel.
+
+  The returned `wrapper` does all the work as soon as it is applied: it builds
+  the kernel module, wraps the kernel in a timed `main`, runs `pipeline` over
+  the result, JIT-compiles it, invokes `main` once and prints a report.
+  `wrapper` has no return statement, so a name decorated with it is bound to
+  `None`.
+
+  Args:
+    pipeline: Pass pipeline string handed to `PassManager.parse`.
+    number_of_runs: Length of the timing buffer passed to `main`, which is
+      also the number of timed kernel calls.
+
+  Returns:
+    `wrapper(kernel_module)`, where `kernel_module` is a zero-argument
+    callable returning a module whose first top-level operation is the kernel
+    function.
+
+  Raises:
+    AssertionError: If the path in `MLIR_C_RUNNER_UTILS` or
+      `MLIR_RUNNER_UTILS` does not exist (including when the variable is
+      unset).
+  """
+  def wrapper(kernel_module):
+    # Shared libraries handed to the execution engine; both must exist.
+    c_runner_utils = os.getenv("MLIR_C_RUNNER_UTILS", "")
+    assert os.path.exists(c_runner_utils), f"{c_runner_utils} does not exist"
+    runner_utils = os.getenv("MLIR_RUNNER_UTILS", "")
+    assert os.path.exists(runner_utils), f"{runner_utils} does not exist"
+
+    with ir.Context(), ir.Location.unknown():
+      kernel_func_main_module = kernel_module()
+      # The kernel is taken to be the first operation in the module body.
+      kernel_func = kernel_func_main_module.operation.regions[0].blocks[0].operations[0]
+      main_module_with_benchmark = ir.Module.create()
+      with ir.InsertionPoint(main_module_with_benchmark.body):
+        timer_func = emit_timer_func()
+        wrapped_func = emit_benchmark_wrapped_main_func(kernel_func, timer_func)
+        # Rebuild the module from the textual form of the three functions.
+        # NOTE(review): presumably done because `kernel_func` lives in a
+        # different module than the two functions emitted above; confirm.
+        main_module_with_benchmark = ir.Module.parse(
+          str(timer_func) + str(wrapped_func) + str(kernel_func)
+        )
+
+      PassManager.parse(pipeline).run(main_module_with_benchmark)
+
+      tensor_mem_args = construct_arguments_for_kernel_function(kernel_func)
+      # One i64 slot per run; `main` fills it with per-iteration timings.
+      np_timers_ns = np.zeros([number_of_runs], dtype=np.int64)
+      tensor_mem_args.append(
+        ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(np_timers_ns)))
+      )
+
+      print("")
+      print("*" * 20)
+      print(f"Benchmarking {kernel_func.name}")
+      # perf_counter() is monotonic, so the measured interval cannot be
+      # distorted by wall-clock adjustments the way time.time() can.
+      compilation_time_start_seconds = time.perf_counter()
+      # NOTE(review): the positional 3 is presumably the optimization level.
+      engine = ExecutionEngine(main_module_with_benchmark, 3, shared_libs=[c_runner_utils, runner_utils])
+      compilation_time_seconds = time.perf_counter() - compilation_time_start_seconds
+      print(f"Compilation time: {compilation_time_seconds}")
+
+      engine.invoke("main", *tensor_mem_args)
+      # Scale the recorded values by 1e-6 (nanoseconds to milliseconds,
+      # assuming the timer reports nanoseconds as its name suggests).
+      np_timers_ms = [t * 10**(-6) for t in np_timers_ns]
+      print_benchmark_report(np_timers_ms)
+      print("*" * 20)
+
+  return wrapper
diff --git a/mlir/benchmark/python/sparse.bench.py b/mlir/benchmark/python/sparse.bench.py
new file mode 100644
--- /dev/null
+++ b/mlir/benchmark/python/sparse.bench.py
@@ -0,0 +1,50 @@
+import mlir.all_passes_registration
+
+from mlir import ir
+from mlir.dialects import builtin
+from mlir.dialects.linalg.opdsl import lang as dsl
+from common import benchmark
+
+
+# Matrix multiplication written in the linalg OpDSL. A is M x K, B is K x N
+# and the output C is M x N, all sharing the element type variable T. For
+# every (m, n), C[m, n] accumulates A[m, k] * B[k, n] over k.
+@dsl.linalg_structured_op
+def matmul_dsl(
+  A=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.K),
+  B=dsl.TensorDef(dsl.T, dsl.S.K, dsl.S.N),
+  C=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.N, output=True)
+):
+  C[dsl.D.m, dsl.D.n] += A[dsl.D.m, dsl.D.k] * B[dsl.D.k, dsl.D.n]
+
+
+def get_sparse_kernel_pipeline() -> str:
+  """Return the comma-separated pass pipeline used for the sparse kernel.
+
+  The passes are listed in the order they appear in the returned string; the
+  sparsification pass receives its options inside braces.
+  """
+  sparsification_options = (
+    "parallelization-strategy=0 vectorization-strategy=0 vl=1 "
+    "enable-simd-index32=False"
+  )
+  passes = [
+    "builtin.func(linalg-generalize-named-ops,linalg-fuse-elementwise-ops)",
+    "sparsification{" + sparsification_options + "}",
+    "sparse-tensor-conversion",
+    "builtin.func(linalg-bufferize,convert-linalg-to-loops,convert-vector-to-scf)",
+    "convert-scf-to-std",
+    "func-bufferize",
+    "tensor-constant-bufferize",
+    "builtin.func(tensor-bufferize,std-bufferize,finalizing-bufferize)",
+    "convert-vector-to-llvm{reassociate-fp-reductions=1 enable-index-optimizations=1}",
+    "lower-affine",
+    "convert-memref-to-llvm",
+    "convert-std-to-llvm",
+    "reconcile-unrealized-casts",
+  ]
+  return ",".join(passes)
+
+
+@benchmark(get_sparse_kernel_pipeline(), 100)
+def sparse_kernel_module():
+  """Build a module holding one `sparse_kernel` function that calls matmul.
+
+  The function takes f64 tensors of shape 1000x1500, 1500x2000 and 1000x2000
+  and returns `matmul_dsl` applied to the first two with the third as the
+  output operand. The tensor types are created without an encoding attribute.
+
+  Returns:
+    The module containing `sparse_kernel`.
+  """
+  rows, inner, cols = 1000, 1500, 2000
+  kernel_module = ir.Module.create()
+  element_type = ir.F64Type.get()
+  lhs_type = ir.RankedTensorType.get([rows, inner], element_type)
+  rhs_type = ir.RankedTensorType.get([inner, cols], element_type)
+  out_type = ir.RankedTensorType.get([rows, cols], element_type)
+
+  with ir.InsertionPoint(kernel_module.body):
+    @builtin.FuncOp.from_py_func(lhs_type, rhs_type, out_type)
+    def sparse_kernel(x, y, z):
+      return matmul_dsl(x, y, outs=[z])
+
+  return kernel_module
+