diff --git a/mlir/integration_test/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir b/mlir/integration_test/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/integration_test/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt %s \
+// RUN:   -linalg-tile-to-parallel-loops="linalg-tile-sizes=256" \
+// RUN:   -async-parallel-for="num-concurrent-async-execute=4" \
+// RUN:   -async-ref-counting \
+// RUN:   -convert-async-to-llvm \
+// RUN:   -lower-affine \
+// RUN:   -convert-linalg-to-loops \
+// RUN:   -convert-scf-to-std \
+// RUN:   -std-expand \
+// RUN:   -convert-vector-to-llvm \
+// RUN:   -convert-std-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void -O3 \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
+// RUN: | FileCheck %s --dump-input=always --check-prefix=CHECK-ASYNC

+// RUN: mlir-opt %s \
+// RUN:   -convert-linalg-to-loops \
+// RUN:   -convert-scf-to-std \
+// RUN:   -convert-vector-to-llvm \
+// RUN:   -convert-std-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   -e entry -entry-point-result=void -O3 \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
+// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
+// RUN: | FileCheck %s --dump-input=always --check-prefix=CHECK-SEQ

+#map0 = affine_map<(d0, d1) -> (d0, d1)>

+func @linalg_generic(%lhs: memref<?x?xf32>,
+                     %rhs: memref<?x?xf32>,
+                     %sum: memref<?x?xf32>) {
+  linalg.generic {
+    indexing_maps = [#map0, #map0, #map0],
+    iterator_types = ["parallel", "parallel"]
+  }
+    ins(%lhs, %rhs : memref<?x?xf32>, memref<?x?xf32>)
+    outs(%sum : memref<?x?xf32>)
+  {
+    ^bb0(%lhs_in: f32, %rhs_in: f32, %sum_out: f32):
+      %0 = addf %lhs_in, %rhs_in : f32
+      linalg.yield %0 : f32
+  }

+  return
+}

+func @entry() {
+  %f1 = constant 1.0 : f32
+  %f4 = constant 4.0 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %cM = constant 1000 : index

+  //
+  // Sanity check for the function under test.
+  //

+  %LHS10 = alloc() {alignment = 64} : memref<1x10xf32>
+  %RHS10 = alloc() {alignment = 64} : memref<1x10xf32>
+  %DST10 = alloc() {alignment = 64} : memref<1x10xf32>

+  linalg.fill(%LHS10, %f1) : memref<1x10xf32>, f32
+  linalg.fill(%RHS10, %f1) : memref<1x10xf32>, f32

+  %LHS = memref_cast %LHS10 : memref<1x10xf32> to memref<?x?xf32>
+  %RHS = memref_cast %RHS10 : memref<1x10xf32> to memref<?x?xf32>
+  %DST = memref_cast %DST10 : memref<1x10xf32> to memref<?x?xf32>

+  call @linalg_generic(%LHS, %RHS, %DST)
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()

+  // CHECK-ASYNC: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+  // CHECK-SEQ: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+  %U = memref_cast %DST10 : memref<1x10xf32> to memref<*xf32>
+  call @print_memref_f32(%U): (memref<*xf32>) -> ()

+  dealloc %LHS10: memref<1x10xf32>
+  dealloc %RHS10: memref<1x10xf32>
+  dealloc %DST10: memref<1x10xf32>

+  //
+  // Allocate data for microbenchmarks.
+  //

+  %LHS4096 = alloc() {alignment = 64} : memref<4096x4096xf32>
+  %RHS4096 = alloc() {alignment = 64} : memref<4096x4096xf32>
+  %DST4096 = alloc() {alignment = 64} : memref<4096x4096xf32>

+  %LHS0 = memref_cast %LHS4096 : memref<4096x4096xf32> to memref<?x?xf32>
+  %RHS0 = memref_cast %RHS4096 : memref<4096x4096xf32> to memref<?x?xf32>
+  %DST0 = memref_cast %DST4096 : memref<4096x4096xf32> to memref<?x?xf32>

+  //
+  // Warm up.
+  //

+  call @linalg_generic(%LHS0, %RHS0, %DST0)
+    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()

+  //
+  // Measure execution time.
+  //

+  %t0 = call @rtclock() : () -> f64
+  scf.for %i = %c0 to %cM step %c1 {
+    call @linalg_generic(%LHS0, %RHS0, %DST0)
+      : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
+  }
+  %t1 = call @rtclock() : () -> f64
+  %t4096 = subf %t1, %t0 : f64

+  // Print timings.
+  vector.print %t4096 : f64

+  // Free.
+  dealloc %LHS4096: memref<4096x4096xf32>
+  dealloc %RHS4096: memref<4096x4096xf32>
+  dealloc %DST4096: memref<4096x4096xf32>

+  return
+}

+func private @rtclock() -> f64

+func private @print_memref_f32(memref<*xf32>)
+  attributes { llvm.emit_c_interface }
diff --git a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
--- a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
+++ b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
@@ -592,7 +592,7 @@
 
     CallOp call = cast<CallOp>(op);
     rewriter.replaceOpWithNewOp<CallOp>(op, resultTypes, call.callee(),
-                                        call.getOperands());
+                                        operands);
 
     return success();
   }
@@ -733,7 +733,7 @@
     // async API await function call.
     if (!isInCoroutine)
       rewriter.create<CallOp>(loc, TypeRange(), blockingAwaitFuncName,
-                              ValueRange(op->getOperand(0)));
+                              ValueRange(operands[0]));
 
     // Inside the coroutine we convert await operation into coroutine suspension
     // point, and resume execution asynchronously.
@@ -755,8 +755,8 @@
 
       // Call async runtime API to resume a coroutine in the managed thread when
       // the async await argument becomes ready.
-      SmallVector<Value, 3> awaitAndExecuteArgs = {
-          await.getOperand(), coro.coroHandle, resumePtr.res()};
+      SmallVector<Value, 3> awaitAndExecuteArgs = {operands[0], coro.coroHandle,
+                                                   resumePtr.res()};
       builder.create<CallOp>(loc, TypeRange(), coroAwaitFuncName,
                              awaitAndExecuteArgs);
 
diff --git a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
--- a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
@@ -24,6 +24,8 @@
 #include <thread>
 #include <vector>
 
+#include "llvm/Support/ThreadPool.h"
+
 //===----------------------------------------------------------------------===//
 // Async runtime API.
 //===----------------------------------------------------------------------===//
@@ -229,8 +231,8 @@
 extern "C" void mlirAsyncRuntimeExecute(CoroHandle handle, CoroResume resume) {
 #if LLVM_ENABLE_THREADS
-  std::thread thread([handle, resume]() { (*resume)(handle); });
-  thread.detach();
+  static llvm::ThreadPool *threadPool = new llvm::ThreadPool();
+  threadPool->async([handle, resume]() { (*resume)(handle); });
 #else
   (*resume)(handle);
 #endif
 }
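
Note on the AsyncToLLVM.cpp hunks: all three changes are the same fix. Values reached through the matched op itself (call.getOperands(), op->getOperand(0), await.getOperand()) are the original, pre-conversion SSA values; the `operands` array passed into matchAndRewrite holds the values as remapped by earlier patterns and the type converter (here, !async.token already rewritten to the runtime's pointer type), and it is what the runtime calls must be built from. Below is a minimal sketch of the idiom, assuming the MLIR API of this revision; the pattern name AwaitTokenLowering and the direct use of the "mlirAsyncRuntimeAwaitToken" symbol are illustrative, not the actual pattern in AsyncToLLVM.cpp:

  #include "mlir/Dialect/StandardOps/IR/Ops.h"
  #include "mlir/Transforms/DialectConversion.h"

  using namespace mlir;

  // Illustrative lowering of `async.await` on a token to a blocking call
  // into the async runtime. The call is built from operands[0], the value
  // produced by the conversion, because op->getOperand(0) still carries
  // the pre-conversion !async.token type at this point.
  struct AwaitTokenLowering : public ConversionPattern {
    explicit AwaitTokenLowering(MLIRContext *ctx)
        : ConversionPattern("async.await", /*benefit=*/1, ctx) {}

    LogicalResult
    matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                    ConversionPatternRewriter &rewriter) const override {
      rewriter.create<CallOp>(op->getLoc(), TypeRange(),
                              "mlirAsyncRuntimeAwaitToken",
                              ValueRange(operands[0]));
      rewriter.eraseOp(op);
      return success();
    }
  };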
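
Note on the AsyncRuntime.cpp hunk: the old code created and detached a fresh OS thread for every resumed coroutine, which pays thread-creation cost per task and, under the async-parallel-for microbenchmark above, can oversubscribe the machine. Queueing the resume on a single llvm::ThreadPool reuses a fixed set of workers sized by the default hardware_concurrency() strategy. The pool is a leaked static on purpose: constructed on first use and never destroyed, so tasks still in flight at process shutdown cannot race with a pool destructor. A standalone sketch of the same pattern; scheduleResume and the Handle/Resume aliases are illustrative stand-ins for the runtime's CoroHandle/CoroResume:

  #include "llvm/Support/ThreadPool.h"

  using Handle = void *;
  using Resume = void (*)(void *);

  // One process-wide pool, constructed lazily and intentionally leaked.
  static void scheduleResume(Handle handle, Resume resume) {
    static llvm::ThreadPool *pool = new llvm::ThreadPool();
    // async() enqueues the callable on a reused worker thread and returns
    // a std::shared_future<void>; the caller may drop it when completion
    // is signaled through another channel, as the async runtime does via
    // its tokens and values.
    pool->async([handle, resume] { (*resume)(handle); });
  }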