diff --git a/mlir/benchmark/python/common.py b/mlir/benchmark/python/common.py
new file mode 100644
--- /dev/null
+++ b/mlir/benchmark/python/common.py
@@ -0,0 +1,212 @@
+import ctypes
+import numpy as np
+import os
+import re
+import time
+
+from mlir import ir
+from mlir import runtime as rt
+from mlir.dialects import builtin
+from mlir.dialects import arith
+from mlir.dialects import memref
+from mlir.dialects import scf
+from mlir.dialects import std
+from mlir.execution_engine import ExecutionEngine
+from mlir.passmanager import PassManager
+
+
+def _static_dimensions(tensor_type):
+    """Parse the dimensions out of the textual form of `tensor_type`.
+
+    The type is printed with `str`; every "<", ">" and "tensor" is removed and
+    the remainder is split on "x". The last piece is dropped (for a type that
+    prints as `tensor<4x8xf64>` this is the element type) and each remaining
+    piece is converted with `int`, so a non-integer piece raises `ValueError`.
+    """
+    dimensions_str = re.sub(r"<|>|tensor", "", str(tensor_type))
+    return [int(dim) for dim in dimensions_str.split("x")[:-1]]
+
+
+def create_random_np_tensor(tensor_type):
+    """Return a numpy array shaped like `tensor_type`, uniform over [0, 100)."""
+    dimensions = _static_dimensions(tensor_type)
+    return np.random.uniform(low=0.0, high=100.0, size=dimensions)
+
+
+def create_zero_np_tensor(tensor_type):
+    """Return a float64 numpy array of zeros shaped like `tensor_type`."""
+    return np.zeros(_static_dimensions(tensor_type), np.float64)
+
+
+def _as_memref_argument(np_array):
+    """Wrap the ranked memref descriptor of `np_array` in a double pointer."""
+    descriptor = rt.get_ranked_memref_descriptor(np_array)
+    return ctypes.pointer(ctypes.pointer(descriptor))
+
+
+def construct_arguments_for_kernel_function(kernel_func):
+    """Build the ctypes arguments for the tensor operands of `kernel_func`.
+
+    The returned list holds, in order: a zero tensor shaped like the last
+    kernel input, a random tensor for every kernel input except the last, and
+    a second zero tensor shaped like the last kernel input.
+    """
+    input_types = kernel_func.type.inputs
+    # NOTE(review): the leading zero tensor presumably backs the value that
+    # "main" returns through the C interface -- confirm.
+    tensor_np_args = [create_zero_np_tensor(input_types[-1])]
+    tensor_np_args.extend(
+        create_random_np_tensor(input_type) for input_type in input_types[:-1])
+    tensor_np_args.append(create_zero_np_tensor(input_types[-1]))
+    return [_as_memref_argument(np_tensor) for np_tensor in tensor_np_args]
+
+
+def emit_timer_func() -> builtin.FuncOp:
+    """Emit the declaration of the private function `nano_time`.
+
+    `nano_time` takes no arguments and returns a signless 64-bit integer; the
+    declaration is tagged with the `llvm.emit_c_interface` unit attribute.
+    """
+    i64_type = ir.IntegerType.get_signless(64)
+    nano_time = builtin.FuncOp(
+        "nano_time", ([], [i64_type]), visibility="private")
+    nano_time.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+    return nano_time
+
+
+def emit_benchmark_wrapped_main_func(
+    func: builtin.FuncOp,
+    timer_func: builtin.FuncOp
+) -> builtin.FuncOp:
+    """Emit the public function "main" that calls `func` in a timed loop.
+
+    "main" takes the arguments of `func` plus a trailing rank-1 memref of i64
+    (the timer buffer) and returns the result types of `func`. It loops from 0
+    to the size of dimension 0 of the timer buffer in steps of 1; each
+    iteration calls `timer_func`, then `func`, then `timer_func` again, and
+    stores the difference of the two timer values at the iteration index. The
+    results of `func` are yielded to the next iteration and the loop results
+    are returned from "main".
+    """
+    i64_type = ir.IntegerType.get_signless(64)
+    memref_of_i64_type = ir.MemRefType.get([-1], i64_type)
+    wrapped_func = builtin.FuncOp(
+        # Same signature and an extra buffer of indices to save timings.
+        "main",
+        (func.arguments.types + [memref_of_i64_type], func.type.results),
+        visibility="public")
+    wrapped_func.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
+
+    num_results = len(func.type.results)
+    with ir.InsertionPoint(wrapped_func.add_entry_block()):
+        timer_buffer = wrapped_func.arguments[-1]
+        zero = arith.ConstantOp.create_index(0)
+        n_iterations = memref.DimOp(ir.IndexType.get(), timer_buffer, zero)
+        one = arith.ConstantOp.create_index(1)
+        # The last `num_results` arguments before the timer buffer seed the
+        # loop-carried values.
+        iter_args = list(wrapped_func.arguments[-num_results - 1:-1])
+        loop = scf.ForOp(zero, n_iterations, one, iter_args)
+        with ir.InsertionPoint(loop.body):
+            start = std.CallOp(timer_func, [])
+            call = std.CallOp(
+                func,
+                wrapped_func.arguments[:-num_results - 1] + loop.inner_iter_args)
+            end = std.CallOp(timer_func, [])
+            # Not named `time`, which would shadow the imported `time` module.
+            elapsed = arith.SubIOp(end, start)
+            memref.StoreOp(elapsed, timer_buffer, [loop.induction_variable])
+            scf.YieldOp(list(call.results))
+        std.ReturnOp(loop)
+
+    return wrapped_func
+
+
+def print_benchmark_report(times_array):
+    """Print run count, mean, median, extremes and percentiles of `times_array`.
+
+    Every statistic except the run count is printed with an " ms" suffix.
+    """
+    print(f" Number of runs: {len(times_array)}")
+    print(f" Mean: {np.mean(times_array)} ms")
+    print(f" Median: {np.median(times_array)} ms")
+    print(f" Fastest: {np.amin(times_array)} ms")
+    for percent in (1, 10, 25, 50, 75, 90, 99):
+        print(f" p{percent}: {np.percentile(times_array, percent)} ms")
+    print(f" Slowest: {np.amax(times_array)} ms")
+
+
+def _shared_library_from_env(variable):
+    """Return the existing path named by environment variable `variable`.
+
+    Raises:
+        AssertionError: if the variable is unset or its path does not exist.
+            Raised explicitly rather than through `assert` so that the check
+            is not stripped under `python -O`.
+    """
+    path = os.getenv(variable, "")
+    if not os.path.exists(path):
+        raise AssertionError(f"{variable}={path!r} does not exist")
+    return path
+
+
+def benchmark(pipeline, number_of_runs):
+    """Return a decorator that compiles, runs and reports on an MLIR kernel.
+
+    The decorator runs the benchmark immediately, at decoration time, and
+    returns nothing, so the decorated name ends up bound to `None`.
+
+    Args:
+        pipeline: pass pipeline string handed to `PassManager.parse`.
+        number_of_runs: length of the timer buffer, i.e. how many times the
+            kernel is invoked and timed.
+
+    The decorated callable takes no arguments and must return a module whose
+    first operation is the kernel function. The environment variables
+    `MLIR_C_RUNNER_UTILS` and `MLIR_RUNNER_UTILS` must name existing paths;
+    they are passed to the execution engine as shared libraries.
+    """
+    def wrapper(kernel_module):
+        c_runner_utils = _shared_library_from_env("MLIR_C_RUNNER_UTILS")
+        runner_utils = _shared_library_from_env("MLIR_RUNNER_UTILS")
+
+        with ir.Context(), ir.Location.unknown():
+            kernel_func_main_module = kernel_module()
+            kernel_func = (
+                kernel_func_main_module.operation.regions[0].blocks[0].operations[0])
+            # The timer and wrapper functions are emitted into a scratch module;
+            # the benchmarked module is then parsed from the printed functions.
+            scratch_module = ir.Module.create()
+            with ir.InsertionPoint(scratch_module.body):
+                timer_func = emit_timer_func()
+                wrapped_func = emit_benchmark_wrapped_main_func(
+                    kernel_func, timer_func)
+            main_module_with_benchmark = ir.Module.parse(
+                str(timer_func) + str(wrapped_func) + str(kernel_func)
+            )
+
+            PassManager.parse(pipeline).run(main_module_with_benchmark)
+
+            tensor_mem_args = construct_arguments_for_kernel_function(kernel_func)
+            np_timers_ns = np.zeros([number_of_runs], dtype=np.int64)
+            tensor_mem_args.append(_as_memref_argument(np_timers_ns))
+
+            print("")
+            print("*" * 20)
+            print(f"Benchmarking {kernel_func.name}")
+            # perf_counter is a monotonic clock meant for measuring intervals.
+            compilation_time_start_seconds = time.perf_counter()
+            engine = ExecutionEngine(
+                main_module_with_benchmark, 3,
+                shared_libs=[c_runner_utils, runner_utils])
+            compilation_time_seconds = (
+                time.perf_counter() - compilation_time_start_seconds)
+            print(f"Compilation time: {compilation_time_seconds}")
+
+            engine.invoke("main", *tensor_mem_args)
+            # Scale by 10**-6: nanoseconds to milliseconds, going by the names.
+            np_timers_ms = np_timers_ns * 10**(-6)
+            print_benchmark_report(np_timers_ms)
+            print("*" * 20)
+
+    return wrapper
diff --git a/mlir/benchmark/python/sparse.bench.py b/mlir/benchmark/python/sparse.bench.py
new file mode 100644
--- /dev/null
+++ b/mlir/benchmark/python/sparse.bench.py
@@ -0,0 +1,56 @@
+import mlir.all_passes_registration
+
+from mlir import ir
+from mlir.dialects import builtin
+from mlir.dialects.linalg.opdsl import lang as dsl
+from common import benchmark
+
+
+# Matrix multiplication C[m, n] += A[m, k] * B[k, n] as a linalg OpDSL op.
+@dsl.linalg_structured_op
+def matmul_dsl(
+    A=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.K),
+    B=dsl.TensorDef(dsl.T, dsl.S.K, dsl.S.N),
+    C=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.N, output=True)
+):
+    C[dsl.D.m, dsl.D.n] += A[dsl.D.m, dsl.D.k] * B[dsl.D.k, dsl.D.n]
+
+
+def get_sparse_kernel_pipeline() -> str:
+    """Return the comma-separated pass pipeline used to lower the kernel."""
+    opt = (
+        "parallelization-strategy=0 vectorization-strategy=0 vl=1 "
+        "enable-simd-index32=False"
+    )
+    passes = [
+        "builtin.func(linalg-generalize-named-ops,linalg-fuse-elementwise-ops)",
+        f"sparsification{{{opt}}}",
+        "sparse-tensor-conversion",
+        "builtin.func(linalg-bufferize,convert-linalg-to-loops,convert-vector-to-scf)",
+        "convert-scf-to-std",
+        "func-bufferize",
+        "tensor-constant-bufferize",
+        "builtin.func(tensor-bufferize,std-bufferize,finalizing-bufferize)",
+        "convert-vector-to-llvm{reassociate-fp-reductions=1 enable-index-optimizations=1}",
+        "lower-affine",
+        "convert-memref-to-llvm",
+        "convert-std-to-llvm",
+        "reconcile-unrealized-casts",
+    ]
+    return ",".join(passes)
+
+
+@benchmark(get_sparse_kernel_pipeline(), 100)
+def sparse_kernel_module():
+    """Build a module holding a 1000x1500 by 1500x2000 f64 matmul kernel."""
+    module = ir.Module.create()
+    f64 = ir.F64Type.get()
+    a = ir.RankedTensorType.get([1000, 1500], f64)
+    b = ir.RankedTensorType.get([1500, 2000], f64)
+    c = ir.RankedTensorType.get([1000, 2000], f64)
+    with ir.InsertionPoint(module.body):
+        @builtin.FuncOp.from_py_func(a, b, c)
+        def sparse_kernel(x, y, z):
+            return matmul_dsl(x, y, outs=[z])
+
+    return module