Index: mlir/include/mlir/ExecutionEngine/RunnerUtils.h
===================================================================
--- mlir/include/mlir/ExecutionEngine/RunnerUtils.h
+++ mlir/include/mlir/ExecutionEngine/RunnerUtils.h
@@ -31,6 +31,7 @@
 #endif // _WIN32

 #include <assert.h>
+#include <cmath>
 #include <iostream>

 #include "mlir/ExecutionEngine/CRunnerUtils.h"
@@ -73,11 +74,13 @@
 template <typename T, int Dim>
 std::ostream &operator<<(std::ostream &os, const Vector<T, Dim> &v);

-template <int N> struct StaticSizeMult {
+template <int N>
+struct StaticSizeMult {
   static constexpr int value = 1;
 };

-template <int N, int... Dims> struct StaticSizeMult<N, Dims...> {
+template <int N, int... Dims>
+struct StaticSizeMult<N, Dims...> {
   static constexpr int value = N * StaticSizeMult<Dims...>::value;
 };

@@ -87,7 +90,8 @@
   }
 }

-template <typename T, int Dim> struct VectorDataPrinter {
+template <typename T, int Dim>
+struct VectorDataPrinter {
   static void print(std::ostream &os, const Vector<T, Dim> &val);
 };

@@ -211,6 +215,113 @@
   std::cout << "Unranked Memref ";
   printMemRef(DynamicMemRefType<T>(M));
 }
+
+/// Verify the result of two computations are equivalent up to a small
+/// numerical error and return the number of errors.
+template <typename T>
+struct MemRefDataVerifier {
+  /// Maximum number of errors printed by the verifier.
+  static constexpr int printLimit = 10;
+
+  /// Verify the relative difference of the values is smaller than epsilon.
+  static bool verifyRelErrorSmallerThan(T actual, T expected, T epsilon);
+
+  /// Verify the values are equivalent (integers) or are close (floating-point).
+  static bool verifyElem(T actual, T expected);
+
+  /// Verify the data element-by-element and return the number of errors.
+  static int64_t verify(std::ostream &os, T *actualBasePtr, T *expectedBasePtr,
+                        int64_t dim, int64_t offset, const int64_t *sizes,
+                        const int64_t *strides, int64_t &printCounter);
+};
+
+template <typename T>
+bool MemRefDataVerifier<T>::verifyRelErrorSmallerThan(T actual, T expected,
+                                                      T epsilon) {
+  // Return an error if one of the values is infinite or NaN.
+  if (!std::isfinite(actual) || !std::isfinite(expected))
+    return false;
+  // Return true if the relative error is smaller than epsilon.
+  T delta = std::abs(actual - expected);
+  return (delta <= epsilon * std::abs(expected));
+}
+
+template <typename T>
+bool MemRefDataVerifier<T>::verifyElem(T actual, T expected) {
+  return actual == expected;
+}
+
+template <>
+inline bool MemRefDataVerifier<double>::verifyElem(double actual,
+                                                   double expected) {
+  return verifyRelErrorSmallerThan(actual, expected, 1e-12);
+}
+
+template <>
+inline bool MemRefDataVerifier<float>::verifyElem(float actual,
+                                                  float expected) {
+  return verifyRelErrorSmallerThan(actual, expected, 1e-6f);
+}
+
+template <typename T>
+int64_t MemRefDataVerifier<T>::verify(std::ostream &os, T *actualBasePtr,
+                                      T *expectedBasePtr, int64_t dim,
+                                      int64_t offset, const int64_t *sizes,
+                                      const int64_t *strides,
+                                      int64_t &printCounter) {
+  int64_t errors = 0;
+  // Verify the elements at the current offset.
+  if (dim == 0) {
+    if (!verifyElem(actualBasePtr[offset], expectedBasePtr[offset])) {
+      if (printCounter < printLimit) {
+        os << actualBasePtr[offset] << " != " << expectedBasePtr[offset]
+           << " offset = " << offset << "\n";
+        printCounter++;
+      }
+      errors++;
+    }
+  } else {
+    // Iterate the current dimension and verify recursively.
+    for (int64_t i = 0; i < sizes[0]; ++i) {
+      errors +=
+          verify(os, actualBasePtr, expectedBasePtr, dim - 1,
+                 offset + i * strides[0], sizes + 1, strides + 1, printCounter);
+    }
+  }
+  return errors;
+}
+
+/// Verify the equivalence of two dynamic memrefs and return the number of
+/// errors or -1 if the shape of the memrefs do not match.
+template <typename T>
+int64_t verifyMemRef(const DynamicMemRefType<T> &actual,
+                     const DynamicMemRefType<T> &expected) {
+  // Check if the memref shapes match.
+  for (int64_t i = 0; i < actual.rank; ++i) {
+    if (expected.rank != actual.rank || actual.offset != expected.offset ||
+        actual.sizes[i] != expected.sizes[i] ||
+        actual.strides[i] != expected.strides[i]) {
+      printMemRefMetaData(std::cerr, actual);
+      printMemRefMetaData(std::cerr, expected);
+      return -1;
+    }
+  }
+  // Return the number of errors.
+  int64_t printCounter = 0;
+  return MemRefDataVerifier<T>::verify(
+      std::cerr, actual.basePtr, expected.basePtr, actual.rank, actual.offset,
+      actual.sizes, actual.strides, printCounter);
+}
+
+/// Verify the equivalence of two unranked memrefs and return the number of
+/// errors or -1 if the shape of the memrefs do not match.
+template <typename T>
+int64_t verifyMemRef(UnrankedMemRefType<T> &actual,
+                     UnrankedMemRefType<T> &expected) {
+  return verifyMemRef(DynamicMemRefType<T>(actual),
+                      DynamicMemRefType<T>(expected));
+}
+
 } // namespace impl

 ////////////////////////////////////////////////////////////////////////////////
@@ -247,4 +358,21 @@
     _mlir_ciface_print_memref_vector_4x4xf32(
         StridedMemRefType<Vector1D<4, float>, 2> *M);

+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t _mlir_ciface_verifyMemRefI32(
+    UnrankedMemRefType<int32_t> *actual, UnrankedMemRefType<int32_t> *expected);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t _mlir_ciface_verifyMemRefF32(
+    UnrankedMemRefType<float> *actual, UnrankedMemRefType<float> *expected);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t _mlir_ciface_verifyMemRefF64(
+    UnrankedMemRefType<double> *actual, UnrankedMemRefType<double> *expected);
+
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t verifyMemRefI32(int64_t rank,
+                                                           void *actualPtr,
+                                                           void *expectedPtr);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t verifyMemRefF32(int64_t rank,
+                                                           void *actualPtr,
+                                                           void *expectedPtr);
+extern "C" MLIR_RUNNERUTILS_EXPORT int64_t verifyMemRefF64(int64_t rank,
+                                                           void *actualPtr,
+                                                           void *expectedPtr);
+
 #endif // EXECUTIONENGINE_RUNNERUTILS_H_
Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir
===================================================================
---
mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir @@ -1,6 +1,6 @@ // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \ @@ -9,6 +9,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -87,9 +88,16 @@ %tmatmul = subf %t_end_matmul, %t_start_matmul: f64 call @print_perf(%iters, %tmatmul) : (index, f64) -> () - %res = load %C[%c0, %c0]: !row_major_C - // CHECK: 64 - vector.print %res: f32 + // CHECK: {{^0$}} + %C_ref = alloc() : !row_major_C + linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c + linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) + outs(%C_ref: !row_major_C) + %act = memref_cast %C : !row_major_C to memref<*xf32> + %exp = memref_cast %C_ref : !row_major_C to memref<*xf32> + %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64 + vector.print %errors : i64 + dealloc %C_ref : !row_major_C dealloc %A : !row_major_A dealloc %B : !row_major_B @@ -99,6 +107,7 @@ } func private @rtclock() -> 
f64 +func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. // func private @fill_random_f32(memref<*xf32>) Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir @@ -1,6 +1,6 @@ // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \ // TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed. 
@@ -11,6 +11,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -84,9 +85,16 @@ %tmatmul_column_major = subf %t_end_matmul_column_major, %t_start_matmul_column_major: f64 call @print_perf(%iters, %tmatmul_column_major) : (index, f64) -> () - %res = load %cC[%c0, %c0]: !column_major_C - // CHECK: 64 - vector.print %res: !elem_type_c + // CHECK: {{^0$}} + %cC_ref = alloc() : !column_major_C + linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c + linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B) + outs(%cC_ref: !column_major_C) + %act = memref_cast %cC : !column_major_C to memref<*xf32> + %exp = memref_cast %cC_ref : !column_major_C to memref<*xf32> + %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64 + vector.print %errors : i64 + dealloc %cC_ref : !column_major_C dealloc %cA : !column_major_A dealloc %cB : !column_major_B @@ -96,6 +104,7 @@ } func private @rtclock() -> f64 +func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. 
// func private @fill_random_f32(memref<*xf32>) Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir @@ -1,7 +1,7 @@ // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \ // TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed. 
@@ -12,6 +12,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -95,12 +96,27 @@ %tmatmul_column_major_as_row_major = subf %t_end_matmul_column_major_as_row_major, %t_start_matmul_column_major_as_row_major: f64 call @print_perf(%iters, %tmatmul_column_major_as_row_major) : (index, f64) -> () - %res = load %cC[%c0, %c0]: !column_major_C - // CHECK: 64 - vector.print %res: !elem_type_c - %res2 = load %C[%c0, %c0]: !row_major_C - // CHECK: 64 - vector.print %res2: !elem_type_c + // CHECK: {{^0$}} + %cC_ref = alloc() : !column_major_C + linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c + linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B) + outs(%cC_ref: !column_major_C) + %act1 = memref_cast %cC : !column_major_C to memref<*xf32> + %exp1 = memref_cast %cC_ref : !column_major_C to memref<*xf32> + %errors1 = call @verifyMemRefF32(%act1, %exp1) : (memref<*xf32>, memref<*xf32>) -> i64 + vector.print %errors1 : i64 + dealloc %cC_ref : !column_major_C + + // CHECK: {{^0$}} + %C_ref = alloc() : !row_major_C + linalg.fill(%C_ref, %f0) : !row_major_C, !elem_type_c + linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) + outs(%C_ref: !row_major_C) + %act2 = memref_cast %C : !row_major_C to memref<*xf32> + %exp2 = memref_cast %C_ref : !row_major_C to memref<*xf32> + %errors2 = call @verifyMemRefF32(%act2, %exp2) : (memref<*xf32>, memref<*xf32>) -> i64 + vector.print %errors2 : i64 + dealloc %C_ref : !row_major_C dealloc %A : !row_major_A dealloc %B : !row_major_B @@ -114,6 +130,7 @@ } func private @rtclock() -> f64 +func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes 
{ llvm.emit_c_interface } // TODO: init with random, run and check output. // func private @fill_random_f32(memref<*xf32>) Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir @@ -9,6 +9,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -85,9 +86,16 @@ %tmatmul = subf %t_end_matmul, %t_start_matmul: f64 call @print_perf(%iters, %tmatmul) : (index, f64) -> () - %res = load %C[%c0, %c0]: !row_major_C - // CHECK: 64 - vector.print %res: !elem_type_c + // CHECK: {{^0$}} + %C_ref = alloc() : !row_major_C + linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c + linalg.matmul_i8_i8_i32 ins(%A, %B : !row_major_A, !row_major_B) + outs(%C_ref: !row_major_C) + %res = memref_cast %C : !row_major_C to memref<*xi32> + %exp = memref_cast %C_ref : !row_major_C to memref<*xi32> + %errors = call @verifyMemRefI32(%res, %exp) : (memref<*xi32>, memref<*xi32>) -> i64 + vector.print %errors : i64 + dealloc %C_ref : !row_major_C dealloc %A : !row_major_A dealloc %B : !row_major_B @@ -97,6 +105,7 @@ } func private @rtclock() -> f64 +func private @verifyMemRefI32(memref<*xi32>, memref<*xi32>) -> i64 attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. 
 // func private @fill_random_f32(memref<*xf32>)
Index: mlir/lib/ExecutionEngine/RunnerUtils.cpp
===================================================================
--- mlir/lib/ExecutionEngine/RunnerUtils.cpp
+++ mlir/lib/ExecutionEngine/RunnerUtils.cpp
@@ -80,3 +80,42 @@
 _mlir_ciface_print_memref_4d_f32(StridedMemRefType<float, 4> *M) {
   impl::printMemRef(*M);
 }
+
+extern "C" int64_t
+_mlir_ciface_verifyMemRefI32(UnrankedMemRefType<int32_t> *actual,
+                             UnrankedMemRefType<int32_t> *expected) {
+  return impl::verifyMemRef(*actual, *expected);
+}
+
+extern "C" int64_t
+_mlir_ciface_verifyMemRefF32(UnrankedMemRefType<float> *actual,
+                             UnrankedMemRefType<float> *expected) {
+  return impl::verifyMemRef(*actual, *expected);
+}
+
+extern "C" int64_t
+_mlir_ciface_verifyMemRefF64(UnrankedMemRefType<double> *actual,
+                             UnrankedMemRefType<double> *expected) {
+  return impl::verifyMemRef(*actual, *expected);
+}
+
+extern "C" int64_t verifyMemRefI32(int64_t rank, void *actualPtr,
+                                   void *expectedPtr) {
+  UnrankedMemRefType<int32_t> actualDesc = {rank, actualPtr};
+  UnrankedMemRefType<int32_t> expectedDesc = {rank, expectedPtr};
+  return _mlir_ciface_verifyMemRefI32(&actualDesc, &expectedDesc);
+}
+
+extern "C" int64_t verifyMemRefF32(int64_t rank, void *actualPtr,
+                                   void *expectedPtr) {
+  UnrankedMemRefType<float> actualDesc = {rank, actualPtr};
+  UnrankedMemRefType<float> expectedDesc = {rank, expectedPtr};
+  return _mlir_ciface_verifyMemRefF32(&actualDesc, &expectedDesc);
+}
+
+extern "C" int64_t verifyMemRefF64(int64_t rank, void *actualPtr,
+                                   void *expectedPtr) {
+  UnrankedMemRefType<double> actualDesc = {rank, actualPtr};
+  UnrankedMemRefType<double> expectedDesc = {rank, expectedPtr};
+  return _mlir_ciface_verifyMemRefF64(&actualDesc, &expectedDesc);
+}