diff --git a/mlir/lib/ExecutionEngine/RunnerUtils.cpp b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
--- a/mlir/lib/ExecutionEngine/RunnerUtils.cpp
+++ b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
@@ -15,6 +15,10 @@
 
 #include "mlir/ExecutionEngine/RunnerUtils.h"
 
+#ifndef _WIN32
+#include <sys/time.h>
+#endif // _WIN32
+
 extern "C" void _mlir_ciface_print_memref_vector_4x4xf32(
     StridedMemRefType<Vector2D<4, 4, float>, 2> *M) {
   impl::printMemRef(*M);
@@ -103,3 +107,28 @@
 _mlir_ciface_print_memref_4d_f32(StridedMemRefType<float, 4> *M) {
   impl::printMemRef(*M);
 }
+
+/// Prints GFLOPS rating. `flops` is expected in FLOP/s and is scaled to
+/// GFLOP/s before being written to stderr.
+extern "C" void print_flops(double flops) {
+  fprintf(stderr, "%lf GFLOPS\n", flops / 1.0E9);
+}
+
+/// Returns the number of seconds since Epoch 1970-01-01 00:00:00 +0000 (UTC).
+/// Returns 0.0 if the time cannot be read or on Windows, where this utility
+/// is not implemented; an error is written to stderr in both cases.
+extern "C" double rtclock() {
+#ifndef _WIN32
+  struct timeval tp;
+  int stat = gettimeofday(&tp, NULL);
+  if (stat != 0) {
+    fprintf(stderr, "Error returning time from gettimeofday: %d\n", stat);
+    // tp is not guaranteed to be filled in on failure, so do not read it.
+    return 0.0;
+  }
+  return (tp.tv_sec + tp.tv_usec * 1.0e-6);
+#else
+  fprintf(stderr, "Timing utility not implemented on Windows\n");
+  return 0.0;
+#endif // _WIN32
+}
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -10,6 +10,7 @@
 # Passed to lit.site.cfg.py.in to set up the path where to find the libraries
 # for linalg integration tests.
 set(MLIR_DIALECT_LINALG_INTEGRATION_TEST_LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 # Passed to lit.site.cfg.py.in to set up the path where to find the libraries
 # for the mlir cuda runner tests.
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -66,8 +66,9 @@
     ToolSubst('toy-ch3', unresolved='ignore'),
     ToolSubst('toy-ch4', unresolved='ignore'),
     ToolSubst('toy-ch5', unresolved='ignore'),
-    ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
     ToolSubst('%cuda_wrapper_library_dir', config.cuda_wrapper_library_dir, unresolved='ignore'),
+    ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
+    ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
     ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore')
 ])
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -30,6 +30,7 @@
 config.host_arch = "@HOST_ARCH@"
 config.mlir_src_root = "@MLIR_SOURCE_DIR@"
 config.mlir_obj_root = "@MLIR_BINARY_DIR@"
+config.mlir_runner_utils_dir = "@MLIR_RUNNER_UTILS_DIR@"
 config.mlir_tools_dir = "@MLIR_TOOLS_DIR@"
 config.linalg_test_lib_dir = "@MLIR_DIALECT_LINALG_INTEGRATION_TEST_LIB_DIR@"
 config.build_examples = @LLVM_BUILD_EXAMPLES@
diff --git a/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir b/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir
@@ -0,0 +1,76 @@
+// RUN: mlir-opt -convert-linalg-to-loops -lower-affine -convert-loop-to-std -convert-std-to-llvm %s | mlir-cpu-runner -O3 -e main -entry-point-result=void -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext | FileCheck %s
+
+// Times repeated runs of @sgemm_naive on 64x64 f32 matrices, passes the result
+// matrix to @print_memref_f32, and reports the achieved rate via @print_flops.
+func @main() {
+  %A = alloc() : memref<64x64xf32>
+  %B = alloc() : memref<64x64xf32>
+  %C = alloc() : memref<64x64xf32>
+
+  %cf1 = constant 1.00000e+00 : f32
+
+  linalg.fill(%A, %cf1) : memref<64x64xf32>, f32
+  linalg.fill(%B, %cf1) : memref<64x64xf32>, f32
+
+  // Number of timed repetitions; keep in sync with the loop bound below.
+  %reps = constant 5 : index
+
+  %t_start = call @rtclock() : () -> f64
+  affine.for %arg0 = 0 to 5 {
+    linalg.fill(%C, %cf1) : memref<64x64xf32>, f32
+    call @sgemm_naive(%A, %B, %C) : (memref<64x64xf32>, memref<64x64xf32>, memref<64x64xf32>) -> ()
+  }
+  %t_end = call @rtclock() : () -> f64
+  %t = subf %t_end, %t_start : f64
+
+  %pC = memref_cast %C : memref<64x64xf32> to memref<*xf32>
+  call @print_memref_f32(%pC) : (memref<*xf32>) -> ()
+
+  %M = dim %C, 0 : memref<64x64xf32>
+  %N = dim %C, 1 : memref<64x64xf32>
+  %K = dim %A, 1 : memref<64x64xf32>
+
+  %f1 = muli %M, %N : index
+  %f2 = muli %f1, %K : index
+
+  // 2*M*N*K.
+  %c2 = constant 2 : index
+  %f3 = muli %c2, %f2 : index
+  %num_flops = muli %reps, %f3 : index
+  %num_flops_i = index_cast %num_flops : index to i64
+  %num_flops_f = sitofp %num_flops_i : i64 to f64
+  %flops = divf %num_flops_f, %t : f64
+  call @print_flops(%flops) : (f64) -> ()
+
+  return
+}
+// CHECK: 65, 65, 65,
+
+// Naive triple-loop matmul computing %arg2 += %arg0 * %arg1; each output
+// element is accumulated through a one-element scratch buffer.
+func @sgemm_naive(%arg0: memref<64x64xf32>, %arg1: memref<64x64xf32>, %arg2: memref<64x64xf32>) {
+  %c0 = constant 0 : index
+  affine.for %arg3 = 0 to 64 {
+    affine.for %arg4 = 0 to 64 {
+      %m = alloc() : memref<1xf32>
+      %v = affine.load %arg2[%arg3, %arg4] : memref<64x64xf32>
+      affine.store %v, %m[%c0] : memref<1xf32>
+      affine.for %arg5 = 0 to 64 {
+        %3 = affine.load %arg0[%arg3, %arg5] : memref<64x64xf32>
+        %4 = affine.load %arg1[%arg5, %arg4] : memref<64x64xf32>
+        %5 = affine.load %m[0] : memref<1xf32>
+        %6 = mulf %3, %4 : f32
+        %7 = addf %6, %5 : f32
+        affine.store %7, %m[0] : memref<1xf32>
+      }
+      %s = affine.load %m[%c0] : memref<1xf32>
+      affine.store %s, %arg2[%arg3, %arg4] : memref<64x64xf32>
+      dealloc %m : memref<1xf32>
+    }
+  }
+  return
+}
+
+func @print_flops(f64)
+func @rtclock() -> f64
+func @print_memref_f32(memref<*xf32>)