Index: mlir/include/mlir/ExecutionEngine/RunnerUtils.h =================================================================== --- mlir/include/mlir/ExecutionEngine/RunnerUtils.h +++ mlir/include/mlir/ExecutionEngine/RunnerUtils.h @@ -31,6 +31,7 @@ #endif // _WIN32 #include +#include #include #include "mlir/ExecutionEngine/CRunnerUtils.h" @@ -73,11 +74,13 @@ template std::ostream &operator<<(std::ostream &os, const Vector &v); -template struct StaticSizeMult { +template +struct StaticSizeMult { static constexpr int value = 1; }; -template struct StaticSizeMult { +template +struct StaticSizeMult { static constexpr int value = N * StaticSizeMult::value; }; @@ -87,7 +90,8 @@ } } -template struct VectorDataPrinter { +template +struct VectorDataPrinter { static void print(std::ostream &os, const Vector &val); }; @@ -211,6 +215,111 @@ std::cout << "Unranked Memref "; printMemRef(DynamicMemRefType(M)); } + +/// Verify the results of two computations are equivalent up to a small +/// numerical error. +template +struct MemRefDataVerifier { + /// Maximum number of errors printed by the verifier. + static constexpr int errorLimit = 10; + + /// Verify the relative difference of the values is smaller than epsilon. + static bool verifyRelErrorSmallerThan(T val1, T val2, T epsilon); + + /// Verify the values are equivalent (integers) or are close (floating-point). + static bool verifyElem(T val1, T val2); + + /// Verify the data element-by-element. + static void verify(std::ostream &os, T *base1, T *base2, int64_t dim, + int64_t offset, const int64_t *sizes, + const int64_t *strides, int64_t &errors); +}; + +template +bool MemRefDataVerifier::verifyRelErrorSmallerThan(T val1, T val2, + T epsilon) { + // Return an error if one of the values is infinite or NaN. + if (!std::isfinite(val1) || !std::isfinite(val2)) + return false; + // Return true if the relative error is smaller than epsilon. + T delta = std::abs(val1 - val2); + T maximum = std::max(std::abs(val1), std::abs(val2)); + if (delta > epsilon * maximum) + return false; + return true; +} + +template +bool MemRefDataVerifier::verifyElem(T val1, T val2) { + return val1 == val2; +} + +template <> +inline bool MemRefDataVerifier::verifyElem(double val1, double val2) { + return verifyRelErrorSmallerThan(val1, val2, 1e-12); +} + +template <> +inline bool MemRefDataVerifier::verifyElem(float val1, float val2) { + return verifyRelErrorSmallerThan(val1, val2, 1e-6); +} + +template +void MemRefDataVerifier::verify(std::ostream &os, T *base1, T *base2, + int64_t dim, int64_t offset, + const int64_t *sizes, const int64_t *strides, + int64_t &errors) { + // Verify the elements at the current offset. + if (dim == 0) { + if (!verifyElem(base1[offset], base2[offset])) { + if (errors < errorLimit) { + os << base1[offset] << " != " << base2[offset] << " offset = " << offset + << "\n"; + } else if (errors == errorLimit) { + os << "...\n"; + } + errors++; + } + return; + } + // Iterate the current dimension and verify recursively. + for (int64_t i = 0; i < sizes[0]; ++i) { + verify(os, base1, base2, dim - 1, offset + i * strides[0], sizes + 1, + strides + 1, errors); + } +} + +/// Verify the equivalence of two dynamic memrefs. +template +void verifyMemRef(const DynamicMemRefType &M1, + const DynamicMemRefType &M2) { + // Check the shapes of the MemRefs match. + for (int64_t i = 0; i < M1.rank; ++i) { + if (M2.rank != M1.rank || M1.offset != M2.offset || + M1.sizes[i] != M2.sizes[i] || M1.strides[i] != M2.strides[i]) { + printMemRefMetaData(std::cout, M1); + printMemRefMetaData(std::cout, M2); + std::cout << "FAILED: MemRef shape missmatch!\n"; + return; + } + } + // Count the errors and print the verification result. + int64_t errors = 0; + MemRefDataVerifier::verify(std::cout, M1.basePtr, M2.basePtr, M1.rank, + M1.offset, M1.sizes, M1.strides, errors); + if (errors != 0) { + std::cout << "FAILED: " << errors << " errors in total!\n"; + return; + } + std::cout << "SUCCESS\n"; +} + +/// Verify the equivalence of two unranked memrefs. +template +void verifyMemRef(UnrankedMemRefType &M1, UnrankedMemRefType &M2) { + verifyMemRef(DynamicMemRefType(M1), DynamicMemRefType(M2)); +} + } // namespace impl //////////////////////////////////////////////////////////////////////////////// @@ -247,4 +356,21 @@ _mlir_ciface_print_memref_vector_4x4xf32( StridedMemRefType, 2> *M); +extern "C" MLIR_RUNNERUTILS_EXPORT void +_mlir_ciface_verify_memref_i32(UnrankedMemRefType *M1, + UnrankedMemRefType *M2); +extern "C" MLIR_RUNNERUTILS_EXPORT void +_mlir_ciface_verify_memref_f32(UnrankedMemRefType *M1, + UnrankedMemRefType *M2); +extern "C" MLIR_RUNNERUTILS_EXPORT void +_mlir_ciface_verify_memref_f64(UnrankedMemRefType *M1, + UnrankedMemRefType *M2); + +extern "C" MLIR_RUNNERUTILS_EXPORT void +verify_memref_i32(int64_t rank, void *ptr1, void *ptr2); +extern "C" MLIR_RUNNERUTILS_EXPORT void +verify_memref_f32(int64_t rank, void *ptr1, void *ptr2); +extern "C" MLIR_RUNNERUTILS_EXPORT void +verify_memref_f64(int64_t rank, void *ptr1, void *ptr2); + #endif // EXECUTIONENGINE_RUNNERUTILS_H_ Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul.mlir @@ -1,6 +1,6 @@ // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \ @@ -9,6 +9,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -57,10 +58,12 @@ %A = alloc() : !row_major_A %B = alloc() : !row_major_B %C = alloc() : !row_major_C + %C_ref = alloc() : !row_major_C linalg.fill(%A, %v1) : !row_major_A, !elem_type_a linalg.fill(%B, %v1) : !row_major_B, !elem_type_b linalg.fill(%C, %v0) : !row_major_C, !elem_type_c + linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c %c0 = constant 0: index %c1 = constant 1: index @@ -87,18 +90,23 @@ %tmatmul = subf %t_end_matmul, %t_start_matmul: f64 call @print_perf(%iters, %tmatmul) : (index, f64) -> () - %res = load %C[%c0, %c0]: !row_major_C - // CHECK: 64 - vector.print %res: f32 + // CHECK: SUCCESS + linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) + outs(%C_ref: !row_major_C) + %res = memref_cast %C : !row_major_C to memref<*xf32> + %exp = memref_cast %C_ref : !row_major_C to memref<*xf32> + call @verify_memref_f32(%res, %exp) : (memref<*xf32>, memref<*xf32>) -> () dealloc %A : !row_major_A dealloc %B : !row_major_B dealloc %C : !row_major_C + dealloc %C_ref : !row_major_C return } func private @rtclock() -> f64 +func private @verify_memref_f32(memref<*xf32>, memref<*xf32>) attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. // func private @fill_random_f32(memref<*xf32>) Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major.mlir @@ -1,6 +1,6 @@ // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \ // TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed. @@ -11,6 +11,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -61,10 +62,12 @@ %cA = alloc() : !column_major_A %cB = alloc() : !column_major_B %cC = alloc() : !column_major_C + %cC_ref = alloc() : !column_major_C linalg.fill(%cA, %f1) : !column_major_A, !elem_type_a linalg.fill(%cB, %f1) : !column_major_B, !elem_type_b linalg.fill(%cC, %f0) : !column_major_C, !elem_type_c + linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c %c0 = constant 0: index %c1 = constant 1: index @@ -84,18 +87,23 @@ %tmatmul_column_major = subf %t_end_matmul_column_major, %t_start_matmul_column_major: f64 call @print_perf(%iters, %tmatmul_column_major) : (index, f64) -> () - %res = load %cC[%c0, %c0]: !column_major_C - // CHECK: 64 - vector.print %res: !elem_type_c + // CHECK: SUCCESS + linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B) + outs(%cC_ref: !column_major_C) + %res = memref_cast %cC : !column_major_C to memref<*xf32> + %exp = memref_cast %cC_ref : !column_major_C to memref<*xf32> + call @verify_memref_f32(%res, %exp) : (memref<*xf32>, memref<*xf32>) -> () dealloc %cA : !column_major_A dealloc %cB : !column_major_B dealloc %cC : !column_major_C + dealloc %cC_ref : !column_major_C return } func private @rtclock() -> f64 +func private @verify_memref_f32(memref<*xf32>, memref<*xf32>) attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. // func private @fill_random_f32(memref<*xf32>) Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir @@ -1,7 +1,7 @@ // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ +// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \ // TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed. @@ -12,6 +12,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -63,14 +64,19 @@ func @main() { %f0 = constant 0.0 : !elem_type_c %f1 = constant 1.0 : !elem_type_a - + %cA = alloc() : !column_major_A %cB = alloc() : !column_major_B %cC = alloc() : !column_major_C + %cC_ref = alloc() : !column_major_C + %C_ref = alloc() : !row_major_C + linalg.fill(%cA, %f1) : !column_major_A, !elem_type_a linalg.fill(%cB, %f1) : !column_major_B, !elem_type_b linalg.fill(%cC, %f0) : !column_major_C, !elem_type_c + linalg.fill(%cC_ref, %f0) : !column_major_C, !elem_type_c + linalg.fill(%C_ref, %f0) : !row_major_C, !elem_type_c %c0 = constant 0: index %c1 = constant 1: index @@ -95,25 +101,34 @@ %tmatmul_column_major_as_row_major = subf %t_end_matmul_column_major_as_row_major, %t_start_matmul_column_major_as_row_major: f64 call @print_perf(%iters, %tmatmul_column_major_as_row_major) : (index, f64) -> () - %res = load %cC[%c0, %c0]: !column_major_C - // CHECK: 64 - vector.print %res: !elem_type_c - %res2 = load %C[%c0, %c0]: !row_major_C - // CHECK: 64 - vector.print %res2: !elem_type_c + // CHECK: SUCCESS + linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B) + outs(%cC_ref: !column_major_C) + %res1 = memref_cast %cC : !column_major_C to memref<*xf32> + %exp1 = memref_cast %cC_ref : !column_major_C to memref<*xf32> + call @verify_memref_f32(%res1, %exp1) : (memref<*xf32>, memref<*xf32>) -> () + // CHECK: SUCCESS + linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) + outs(%C_ref: !row_major_C) + %res2 = memref_cast %C : !row_major_C to memref<*xf32> + %exp2 = memref_cast %C_ref : !row_major_C to memref<*xf32> + call @verify_memref_f32(%res2, %exp2) : (memref<*xf32>, memref<*xf32>) -> () dealloc %A : !row_major_A dealloc %B : !row_major_B dealloc %C : !row_major_C + dealloc %C_ref : !row_major_C dealloc %cA : !column_major_A dealloc %cB : !column_major_B dealloc %cC : !column_major_C + dealloc %cC_ref : !column_major_C return } func private @rtclock() -> f64 +func private @verify_memref_f32(memref<*xf32>, memref<*xf32>) attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. // func private @fill_random_f32(memref<*xf32>) Index: mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir =================================================================== --- mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir +++ mlir/integration_test/Dialect/Linalg/CPU/benchmark_matmul_i8_i8_i32.mlir @@ -9,6 +9,7 @@ // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ // Activate to dump assembly // R_UN: -dump-object-file -object-filename=/tmp/a.o \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // Use tee to both print to stderr and FileCheck // RUN: tee -a /dev/stderr | FileCheck %s @@ -57,10 +58,12 @@ %A = alloc() : !row_major_A %B = alloc() : !row_major_B %C = alloc() : !row_major_C + %C_ref = alloc() : !row_major_C linalg.fill(%A, %v1) : !row_major_A, !elem_type_a linalg.fill(%B, %v1) : !row_major_B, !elem_type_b linalg.fill(%C, %v0) : !row_major_C, !elem_type_c + linalg.fill(%C_ref, %v0) : !row_major_C, !elem_type_c %c0 = constant 0: index %c1 = constant 1: index @@ -85,18 +88,23 @@ %tmatmul = subf %t_end_matmul, %t_start_matmul: f64 call @print_perf(%iters, %tmatmul) : (index, f64) -> () - %res = load %C[%c0, %c0]: !row_major_C - // CHECK: 64 - vector.print %res: !elem_type_c + // CHECK: SUCCESS + linalg.matmul_i8_i8_i32 ins(%A, %B : !row_major_A, !row_major_B) + outs(%C_ref: !row_major_C) + %res = memref_cast %C : !row_major_C to memref<*xi32> + %exp = memref_cast %C_ref : !row_major_C to memref<*xi32> + call @verify_memref_i32(%res, %exp) : (memref<*xi32>, memref<*xi32>) -> () dealloc %A : !row_major_A dealloc %B : !row_major_B dealloc %C : !row_major_C + dealloc %C_ref : !row_major_C return } func private @rtclock() -> f64 +func private @verify_memref_i32(memref<*xi32>, memref<*xi32>) attributes { llvm.emit_c_interface } // TODO: init with random, run and check output. // func private @fill_random_f32(memref<*xf32>) Index: mlir/lib/ExecutionEngine/RunnerUtils.cpp =================================================================== --- mlir/lib/ExecutionEngine/RunnerUtils.cpp +++ mlir/lib/ExecutionEngine/RunnerUtils.cpp @@ -80,3 +80,37 @@ _mlir_ciface_print_memref_4d_f32(StridedMemRefType *M) { impl::printMemRef(*M); } + +extern "C" void +_mlir_ciface_verify_memref_i32(UnrankedMemRefType *M1, + UnrankedMemRefType *M2) { + impl::verifyMemRef(*M1, *M2); +} + +extern "C" void _mlir_ciface_verify_memref_f32(UnrankedMemRefType *M1, + UnrankedMemRefType *M2) { + impl::verifyMemRef(*M1, *M2); +} + +extern "C" void _mlir_ciface_verify_memref_f64(UnrankedMemRefType *M1, + UnrankedMemRefType *M2) { + impl::verifyMemRef(*M1, *M2); +} + +extern "C" void verify_memref_i32(int64_t rank, void *ptr1, void *ptr2) { + UnrankedMemRefType descriptor1 = {rank, ptr1}; + UnrankedMemRefType descriptor2 = {rank, ptr2}; + _mlir_ciface_verify_memref_i32(&descriptor1, &descriptor2); +} + +extern "C" void verify_memref_f32(int64_t rank, void *ptr1, void *ptr2) { + UnrankedMemRefType descriptor1 = {rank, ptr1}; + UnrankedMemRefType descriptor2 = {rank, ptr2}; + _mlir_ciface_verify_memref_f32(&descriptor1, &descriptor2); +} + +extern "C" void verify_memref_f64(int64_t rank, void *ptr1, void *ptr2) { + UnrankedMemRefType descriptor1 = {rank, ptr1}; + UnrankedMemRefType descriptor2 = {rank, ptr2}; + _mlir_ciface_verify_memref_f64(&descriptor1, &descriptor2); +} \ No newline at end of file