diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -23,9 +23,18 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS) set(INTEL_SDE_EXECUTABLE "" CACHE STRING "If set, arch-specific integration tests are run with Intel SDE.") + set(ARM_EMULATOR_EXECUTABLE "" CACHE STRING + "If set, arch-specific Arm integration tests are run with an emulator.") + set(ARM_EMULATOR_OPTIONS "" CACHE STRING + "If arch-specific Arm integration tests run emulated, pass these as parameters to the emulator.") + set(ARM_EMULATOR_LLI_EXECUTABLE "" CACHE STRING + "If arch-specific Arm integration tests run emulated, use this Arm native lli.") + set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING + "If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.") option(MLIR_RUN_AMX_TESTS "Run AMX tests.") option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.") option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.") + option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.") # Passed to lit.site.cfg.py.in to set up the path where to find the libraries. set(MLIR_INTEGRATION_TEST_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/lit.local.cfg b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/lit.local.cfg new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/lit.local.cfg @@ -0,0 +1,29 @@ +import sys + +# ArmSVE tests must be enabled via build flag. +if config.mlir_run_arm_sve_tests != 'ON': + config.unsupported = True + +# No JIT on win32. 
+if sys.platform == 'win32':
+  config.unsupported = True
+
+lli_cmd = 'lli'
+if config.arm_emulator_lli_executable:
+  lli_cmd = config.arm_emulator_lli_executable
+
+# Pick the native utility library directory (Arm-native when emulating).
+if config.arm_emulator_utils_lib_dir:
+  config.substitutions.append(('%mlir_native_utils_lib_dir', config.arm_emulator_utils_lib_dir))
+else:
+  config.substitutions.append(('%mlir_native_utils_lib_dir', config.mlir_integration_test_dir))
+
+if config.arm_emulator_executable:
+  # Run test in emulator (qemu or armie)
+  emulation_cmd = config.arm_emulator_executable
+  if config.arm_emulator_options:
+    emulation_cmd = emulation_cmd + ' ' + config.arm_emulator_options
+  emulation_cmd = emulation_cmd + ' ' + lli_cmd
+  config.substitutions.append(('%lli', emulation_cmd))
+else:
+  config.substitutions.append(('%lli', lli_cmd))
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-sve.mlir
@@ -0,0 +1,154 @@
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm="enable-arm-sve" -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-translate -mlir-to-llvmir | \
+// RUN: %lli --entry-function=entry --march=aarch64 --mattr="+sve" --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+// Note: To run this test, your CPU must support SVE
+
+// SVE-based memcopy
+func @kernel_copy(%src : memref<?xi64>, %dst : memref<?xi64>, %size : index) {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %vs = vector.vscale
+  %step = arith.muli %c2, %vs : index
+  scf.for %i0 = %c0 to %size step %step {
+    %0 = vector.load %src[%i0] : memref<?xi64>, vector<[2]xi64>
+    vector.store %0, %dst[%i0] : memref<?xi64>, vector<[2]xi64>
+  }
+
+  return
+}
+
+// SVE-based multiply and add
+func @kernel_muladd(%a : memref<?xi64>,
+                    %b : memref<?xi64>,
+                    %c : memref<?xi64>,
+                    %size : index) {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %vs = vector.vscale
+  %step = arith.muli %c2, %vs : index
+  scf.for %i0 = %c0 to %size step %step {
+    %0 = vector.load %a[%i0] : memref<?xi64>, vector<[2]xi64>
+    %1 = vector.load %b[%i0] : memref<?xi64>, vector<[2]xi64>
+    %2 = vector.load %c[%i0] : memref<?xi64>, vector<[2]xi64>
+    %3 = arith.muli %0, %1 : vector<[2]xi64>
+    %4 = arith.addi %3, %2 : vector<[2]xi64>
+    vector.store %4, %c[%i0] : memref<?xi64>, vector<[2]xi64>
+  }
+  return
+}
+
+// SVE-based absolute difference
+func @kernel_absdiff(%a : memref<?xi64>,
+                     %b : memref<?xi64>,
+                     %c : memref<?xi64>,
+                     %size : index) {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %vs = vector.vscale
+  %step = arith.muli %c2, %vs : index
+  scf.for %i0 = %c0 to %size step %step {
+    %0 = vector.load %a[%i0] : memref<?xi64>, vector<[2]xi64>
+    %1 = vector.load %b[%i0] : memref<?xi64>, vector<[2]xi64>
+    %agb = arith.cmpi sge, %0, %1 : vector<[2]xi64>
+    %bga = arith.cmpi slt, %0, %1 : vector<[2]xi64>
+    %10 = arm_sve.masked.subi %agb, %0, %1 : vector<[2]xi1>,
+                                             vector<[2]xi64>
+    %01 = arm_sve.masked.subi %bga, %1, %0 : vector<[2]xi1>,
+                                             vector<[2]xi64>
+    vector.maskedstore %c[%i0], %agb, %10 : memref<?xi64>,
+                                            vector<[2]xi1>,
+                                            vector<[2]xi64>
+    vector.maskedstore %c[%i0], %bga, %01 : memref<?xi64>,
+                                            vector<[2]xi1>,
+                                            vector<[2]xi64>
+  }
+  return
+}
+
+func @entry() -> i32 {
+  %i0 = arith.constant 0: i64
+  %i1 = arith.constant 1: i64
+  %r0 = arith.constant 0: i32
+  %c0 = arith.constant 0: index
+  %c1 = arith.constant 1: index
+  %c2 = arith.constant 2: index
+  %c4 = arith.constant 4: index
+  %c8 = arith.constant 8: index
+  %c32 = arith.constant 32: index
+
+  // Set up memory.
+  %a = memref.alloc() : memref<32xi64>
+  %a_copy = memref.alloc() : memref<32xi64>
+  %b = memref.alloc() : memref<32xi64>
+  %c = memref.alloc() : memref<32xi64>
+  %d = memref.alloc() : memref<32xi64>
+
+  %a_data = arith.constant dense<[1 , 2, 3 , 4 , 5, 6, 7, 8,
+                                  9, 10, 11, 12, 13, 14, 15, 16,
+                                  17, 18, 19, 20, 21, 22, 23, 24,
+                                  25, 26, 27, 28, 29, 30, 31, 32]> : vector<32xi64>
+  vector.transfer_write %a_data, %a[%c0] : vector<32xi64>, memref<32xi64>
+  %b_data = arith.constant dense<[33, 34, 35, 36, 37, 38, 39, 40,
+                                  41, 42, 43, 44, 45, 46, 47, 48,
+                                  49, 50, 51, 52, 53, 54, 55, 56,
+                                  57, 58, 59, 60, 61, 62, 63, 64]> : vector<32xi64>
+  vector.transfer_write %b_data, %b[%c0] : vector<32xi64>, memref<32xi64>
+  %d_data = arith.constant dense<[-9, 76, -7, 78, -5, 80, -3, 82,
+                                  -1, 84, 1, 86, 3, 88, 5, 90,
+                                  7, 92, 9, 94, 11, 96, 13, 98,
+                                  15, 100, 17, 102, 19, 104, 21, 106]> : vector<32xi64>
+  vector.transfer_write %d_data, %d[%c0] : vector<32xi64>, memref<32xi64>
+  %zero_data = vector.broadcast %i0 : i64 to vector<32xi64>
+  vector.transfer_write %zero_data, %a_copy[%c0] : vector<32xi64>, memref<32xi64>
+  %one_data = vector.broadcast %i1 : i64 to vector<32xi64>
+  vector.transfer_write %one_data, %c[%c0] : vector<32xi64>, memref<32xi64>
+
+
+  // Call kernel.
+  %0 = memref.cast %a : memref<32xi64> to memref<?xi64>
+  %1 = memref.cast %a_copy : memref<32xi64> to memref<?xi64>
+  call @kernel_copy(%0, %1, %c32) : (memref<?xi64>, memref<?xi64>, index) -> ()
+
+  // Print and verify.
+  //
+  // CHECK: ( 1, 2, 3, 4 )
+  // CHECK-NEXT: ( 5, 6, 7, 8 )
+  scf.for %i = %c0 to %c32 step %c4 {
+    %cv = vector.transfer_read %a_copy[%i], %i0: memref<32xi64>, vector<4xi64>
+    vector.print %cv : vector<4xi64>
+  }
+
+  %2 = memref.cast %a : memref<32xi64> to memref<?xi64>
+  %3 = memref.cast %b : memref<32xi64> to memref<?xi64>
+  %4 = memref.cast %c : memref<32xi64> to memref<?xi64>
+  call @kernel_muladd(%2, %3, %4, %c32) : (memref<?xi64>, memref<?xi64>, memref<?xi64>, index) -> ()
+
+  // CHECK: ( 34, 69, 106, 145 )
+  // CHECK-NEXT: ( 186, 229, 274, 321 )
+  scf.for %i = %c0 to %c32 step %c4 {
+    %macv = vector.transfer_read %c[%i], %i0: memref<32xi64>, vector<4xi64>
+    vector.print %macv : vector<4xi64>
+  }
+
+  %5 = memref.cast %b : memref<32xi64> to memref<?xi64>
+  %6 = memref.cast %d : memref<32xi64> to memref<?xi64>
+  %7 = memref.cast %c : memref<32xi64> to memref<?xi64>
+  call @kernel_absdiff(%5, %6, %7, %c32) : (memref<?xi64>, memref<?xi64>, memref<?xi64>, index) -> ()
+
+  // CHECK: ( 42, 42, 42, 42 )
+  // CHECK-NEXT: ( 42, 42, 42, 42 )
+  scf.for %i = %c0 to %c32 step %c4 {
+    %abdv = vector.transfer_read %c[%i], %i0: memref<32xi64>, vector<4xi64>
+    vector.print %abdv : vector<4xi64>
+  }
+  // Release resources.
+ memref.dealloc %a : memref<32xi64> + memref.dealloc %a_copy : memref<32xi64> + memref.dealloc %b : memref<32xi64> + memref.dealloc %c : memref<32xi64> + memref.dealloc %d : memref<32xi64> + + return %r0 : i32 +} diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -54,6 +54,11 @@ config.mlir_run_x86vector_tests = @MLIR_RUN_X86VECTOR_TESTS@ config.mlir_run_cuda_tensor_core_tests = @MLIR_RUN_CUDA_TENSOR_CORE_TESTS@ config.mlir_include_integration_tests = @MLIR_INCLUDE_INTEGRATION_TESTS@ +config.arm_emulator_executable = "@ARM_EMULATOR_EXECUTABLE@" +config.arm_emulator_options = "@ARM_EMULATOR_OPTIONS@" +config.arm_emulator_lli_executable = "@ARM_EMULATOR_LLI_EXECUTABLE@" +config.arm_emulator_utils_lib_dir = "@ARM_EMULATOR_UTILS_LIB_DIR@" +config.mlir_run_arm_sve_tests = "@MLIR_RUN_ARM_SVE_TESTS@" # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time.