diff --git a/mlir/include/mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h b/mlir/include/mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h
--- a/mlir/include/mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h
+++ b/mlir/include/mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h
@@ -13,13 +13,29 @@
 namespace mlir {
 
+class ConversionTarget;
 class ModuleOp;
 template <typename T>
 class OperationPass;
+class MLIRContext;
+class OwningRewritePatternList;
+class TypeConverter;
 
 /// Create a pass to convert Async operations to the LLVM dialect.
 std::unique_ptr<OperationPass<ModuleOp>> createConvertAsyncToLLVMPass();
 
+/// Populates patterns for async structural type conversions.
+///
+/// A "structural" type conversion is one where the underlying ops are
+/// completely agnostic to the actual types involved and simply need to update
+/// their types. An example of this is async.execute -- the async.execute op
+/// and the corresponding async.yield ops need to update their types according
+/// to the TypeConverter, but otherwise don't care what type conversions are
+/// happening.
+void populateAsyncStructuralTypeConversionsAndLegality(
+    MLIRContext *context, TypeConverter &typeConverter,
+    OwningRewritePatternList &patterns, ConversionTarget &target);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_ASYNCTOLLVM_ASYNCTOLLVM_H
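Note: a conversion pass is expected to call this helper alongside its other
pattern and legality setup. A minimal sketch of the intended wiring (the pass
name is illustrative; GpuToLLVMConversionPass below shows the actual in-tree
use):

    // Hypothetical pass, assuming the usual MLIR conversion boilerplate.
    void MyAsyncLoweringPass::runOnOperation() {
      MLIRContext *context = &getContext();
      LLVMTypeConverter converter(context);
      OwningRewritePatternList patterns;
      ConversionTarget target(*context);

      // Let async.execute/async.await/async.yield update their types
      // according to the converter; the helper also registers the dynamic
      // legality of those ops.
      populateAsyncStructuralTypeConversionsAndLegality(context, converter,
                                                        patterns, target);

      if (failed(applyPartialConversion(getOperation(), target,
                                        std::move(patterns))))
        signalPassFailure();
    }
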
diff --git a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
--- a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
@@ -26,7 +26,7 @@
 #define MLIR_CRUNNERUTILS_EXPORT __declspec(dllimport)
 #endif // mlir_c_runner_utils_EXPORTS
 #endif // MLIR_CRUNNERUTILS_EXPORT
-#else
+#else // _WIN32
 #define MLIR_CRUNNERUTILS_EXPORT
 #define MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
 #endif // _WIN32
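Note: this hunk only labels the #else, but it sits inside the standard DLL
export/import dance. Condensed from the surrounding guards in CRunnerUtils.h:

    #ifdef _WIN32
    #ifdef mlir_c_runner_utils_EXPORTS // building the shared library
    #define MLIR_CRUNNERUTILS_EXPORT __declspec(dllexport)
    #else // using the shared library
    #define MLIR_CRUNNERUTILS_EXPORT __declspec(dllimport)
    #endif
    #else // _WIN32
    #define MLIR_CRUNNERUTILS_EXPORT
    #endif // _WIN32

This is why the ExecutionEngine CMake change below relocates the
mlir_c_runner_utils_EXPORTS compile definition: the removed line already named
the shared target, but it sat under the static library's section; moving it
next to the shared target's definition keeps the export define where it
belongs.
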
diff --git a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
--- a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
+++ b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
@@ -1136,6 +1136,71 @@
 }
 } // namespace
 
+namespace {
+class ConvertExecuteOpTypes : public OpConversionPattern<ExecuteOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(ExecuteOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    ExecuteOp newOp =
+        cast<ExecuteOp>(rewriter.cloneWithoutRegions(*op.getOperation()));
+    rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(),
+                                newOp.getRegion().end());
+
+    // Set operands and update block argument and result types.
+    newOp->setOperands(operands);
+    if (failed(rewriter.convertRegionTypes(&newOp.getRegion(), *typeConverter)))
+      return failure();
+    for (auto result : newOp.getResults())
+      result.setType(typeConverter->convertType(result.getType()));
+
+    rewriter.replaceOp(op, newOp.getResults());
+    return success();
+  }
+};
+
+// Dummy pattern to trigger the appropriate type conversion / materialization.
+class ConvertAwaitOpTypes : public OpConversionPattern<AwaitOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(AwaitOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<AwaitOp>(op, operands.front());
+    return success();
+  }
+};
+
+// Dummy pattern to trigger the appropriate type conversion / materialization.
+class ConvertYieldOpTypes : public OpConversionPattern<async::YieldOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(async::YieldOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<async::YieldOp>(op, operands);
+    return success();
+  }
+};
+} // namespace
+
 std::unique_ptr<OperationPass<ModuleOp>> mlir::createConvertAsyncToLLVMPass() {
   return std::make_unique<ConvertAsyncToLLVMPass>();
 }
+
+void mlir::populateAsyncStructuralTypeConversionsAndLegality(
+    MLIRContext *context, TypeConverter &typeConverter,
+    OwningRewritePatternList &patterns, ConversionTarget &target) {
+  typeConverter.addConversion([&](TokenType type) { return type; });
+  typeConverter.addConversion([&](ValueType type) {
+    return ValueType::get(typeConverter.convertType(type.getValueType()));
+  });
+
+  patterns
+      .insert<ConvertExecuteOpTypes, ConvertAwaitOpTypes, ConvertYieldOpTypes>(
+          typeConverter, context);
+
+  target.addDynamicallyLegalOp<ExecuteOp, AwaitOp, async::YieldOp>(
+      [&](Operation *op) { return typeConverter.isLegal(op); });
+}
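Note on the "dummy" patterns: recreating await/yield from their already
converted operands is what drives the conversion framework to insert
materializations wherever producer and consumer types now disagree. This patch
relies on the materializations the LLVM lowering already registers; if a custom
payload type needed an explicit bridge, it would be registered on the
TypeConverter roughly as follows (a sketch — MyBridgeCastOp is hypothetical):

    // Hypothetical: materialize a value of the original type from the
    // converted value when some consumer still requires the old type.
    typeConverter.addSourceMaterialization(
        [](OpBuilder &builder, Type resultType, ValueRange inputs,
           Location loc) -> Optional<Value> {
          if (inputs.size() != 1)
            return llvm::None;
          return builder
              .create<MyBridgeCastOp>(loc, resultType, inputs.front())
              .getResult();
        });
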
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -29,6 +29,7 @@
   ${NVPTX_LIBS}
 
   LINK_LIBS PUBLIC
+  MLIRAsyncToLLVM
  MLIRGPU
   MLIRIR
   MLIRLLVMIR
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 
 #include "../PassDetail.h"
+#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -293,10 +294,13 @@
 void GpuToLLVMConversionPass::runOnOperation() {
   LLVMTypeConverter converter(&getContext());
   OwningRewritePatternList patterns;
+  LLVMConversionTarget target(getContext());
+
   populateStdToLLVMConversionPatterns(converter, patterns);
+  populateAsyncStructuralTypeConversionsAndLegality(&getContext(), converter,
+                                                    patterns, target);
   populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
 
-  LLVMConversionTarget target(getContext());
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
     signalPassFailure();
@@ -373,19 +377,19 @@
     return failure();
 
   auto loc = allocOp.getLoc();
+  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());
 
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
   SmallVector<Value, 4> shape;
   SmallVector<Value, 4> strides;
   Value sizeBytes;
-  getMemRefDescriptorSizes(loc, memRefType, operands, rewriter, shape, strides,
-                           sizeBytes);
+  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
+                           shape, strides, sizeBytes);
 
   // Allocate the underlying buffer and store a pointer to it in the MemRef
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
-  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());
   auto stream = adaptor.asyncDependencies().front();
   Value allocatedPtr =
       allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
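Note: the second hunk fixes operand indexing. For gpu.alloc the flat
`operands` list starts with the async dependency tokens, so passing it
directly to getMemRefDescriptorSizes would read a token where a dynamic size
is expected. The generated adaptor recovers the named segments (a recap of the
accessors used above, not new API):

    // gpu.alloc operands are laid out as
    //   [asyncDependencies..., dynamicSizes..., symbolOperands...]
    auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());
    ValueRange deps = adaptor.asyncDependencies(); // !gpu.async.token values
    ValueRange sizes = adaptor.dynamicSizes();     // one index per dynamic dim
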
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -80,6 +80,7 @@
   EXCLUDE_FROM_LIBMLIR
 )
 set_property(TARGET mlir_c_runner_utils PROPERTY CXX_STANDARD 11)
+target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS)
 
 add_mlir_library(mlir_c_runner_utils_static
   CRunnerUtils.cpp
@@ -88,7 +89,6 @@
   EXCLUDE_FROM_LIBMLIR
 )
 set_property(TARGET mlir_c_runner_utils_static PROPERTY CXX_STANDARD 11)
-target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS)
 
 add_mlir_library(mlir_runner_utils
   SHARED
@@ -101,6 +101,15 @@
 )
 target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
 
+add_mlir_library(mlir_runner_utils_static
+  RunnerUtils.cpp
+
+  EXCLUDE_FROM_LIBMLIR
+
+  LINK_LIBS PUBLIC
+  mlir_c_runner_utils_static
+)
+
 add_mlir_library(mlir_async_runtime
   SHARED
   AsyncRuntime.cpp
@@ -112,3 +121,13 @@
   ${LLVM_PTHREAD_LIB}
 )
 target_compile_definitions(mlir_async_runtime PRIVATE mlir_async_runtime_EXPORTS)
+
+add_mlir_library(mlir_async_runtime_static
+  AsyncRuntime.cpp
+
+  EXCLUDE_FROM_LIBMLIR
+
+  LINK_LIBS PUBLIC
+  mlir_c_runner_utils_static
+  ${LLVM_PTHREAD_LIB}
+)
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @main() {
   %data = alloc() : memref<2x6xi32>
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @main() {
   %data = alloc() : memref<2x6xi32>
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @main() {
   %data = alloc() : memref<2x6xi32>
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 // CHECK-COUNT-8: [{{(5356, ){12}5356}}]
 func @main() {
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @main() {
   %data = alloc() : memref<2x6xi32>
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 // CHECK: [{{(35, ){34}35}}]
 func @main() {
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @main() {
   %data = alloc() : memref<2x6xi32>
diff --git a/mlir/test/mlir-cuda-runner/async.mlir b/mlir/test/mlir-cuda-runner/async.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cuda-runner/async.mlir
@@ -0,0 +1,65 @@
+// RUN: mlir-cuda-runner %s --entry-point-result=void -O0 -g \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func @main() {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %count = constant 2 : index
+
+  // initialize h0 on host
+  %h0 = alloc(%count) : memref<?xi32>
+  %h0_unranked = memref_cast %h0 : memref<?xi32> to memref<*xi32>
+  gpu.host_register %h0_unranked : memref<*xi32>
+
+  %v0 = constant 42 : i32
+  store %v0, %h0[%c0] : memref<?xi32>
+  store %v0, %h0[%c1] : memref<?xi32>
+
+  // copy h0 to b0 on device.
+  %t0, %f0 = async.execute () -> !async.value<memref<?xi32>> {
+    %b0 = gpu.alloc(%count) : memref<?xi32>
+    gpu.memcpy %b0, %h0 : memref<?xi32>, memref<?xi32>
+    async.yield %b0 : memref<?xi32>
+  }
+
+  // copy h0 to b1 and b2 (fork)
+  %t1, %f1 = async.execute [%t0] (
+    %f0 as %b0 : !async.value<memref<?xi32>>
+  ) -> !async.value<memref<?xi32>> {
+    %b1 = gpu.alloc(%count) : memref<?xi32>
+    gpu.memcpy %b1, %b0 : memref<?xi32>, memref<?xi32>
+    async.yield %b1 : memref<?xi32>
+  }
+  %t2, %f2 = async.execute [%t0] (
+    %f0 as %b0 : !async.value<memref<?xi32>>
+  ) -> !async.value<memref<?xi32>> {
+    %b2 = gpu.alloc(%count) : memref<?xi32>
+    gpu.memcpy %b2, %b0 : memref<?xi32>, memref<?xi32>
+    async.yield %b2 : memref<?xi32>
+  }
+
+  // h0 = b1 + b2 (join).
+  %t3 = async.execute [%t1, %t2] (
+    %f1 as %b1 : !async.value<memref<?xi32>>,
+    %f2 as %b2 : !async.value<memref<?xi32>>
+  ) {
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+               threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %c1, %block_z = %c1) {
+      %v1 = load %b1[%tx] : memref<?xi32>
+      %v2 = load %b2[%tx] : memref<?xi32>
+      %sum = addi %v1, %v2 : i32
+      store %sum, %h0[%tx] : memref<?xi32>
+      gpu.terminator
+    }
+    async.yield
+  }
+
+  async.await %t3 : !async.token
+  // CHECK: [84, 84]
+  call @print_memref_i32(%h0_unranked) : (memref<*xi32>) -> ()
+  return
+}
+
+func private @print_memref_i32(memref<*xi32>)
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
   %cst = constant 1 : index
diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir
--- a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir
+++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @main() {
   %data = alloc() : memref<2x6xf32>
diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir
--- a/mlir/test/mlir-cuda-runner/shuffle.mlir
+++ b/mlir/test/mlir-cuda-runner/shuffle.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 // CHECK: [4, 5, 6, 7, 0, 1, 2, 3, 12, -1, -1, -1, 8]
 func @main() {
diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/mlir-cuda-runner/two-modules.mlir
--- a/mlir/test/mlir-cuda-runner/two-modules.mlir
+++ b/mlir/test/mlir-cuda-runner/two-modules.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-cuda-runner %s \
+// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 // CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
 func @main() {
diff --git a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
--- a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
+++ b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-rocm-runner %s \
+// RUN:   --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
   %c0 = constant 0 : index
diff --git a/mlir/test/mlir-rocm-runner/two-modules.mlir b/mlir/test/mlir-rocm-runner/two-modules.mlir
--- a/mlir/test/mlir-rocm-runner/two-modules.mlir
+++ b/mlir/test/mlir-rocm-runner/two-modules.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-rocm-runner %s \
+// RUN:   --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 // CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
 func @main() {
diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/mlir-rocm-runner/vecadd.mlir
--- a/mlir/test/mlir-rocm-runner/vecadd.mlir
+++ b/mlir/test/mlir-rocm-runner/vecadd.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-rocm-runner %s \
+// RUN:   --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
   %c0 = constant 0 : index
diff --git a/mlir/test/mlir-rocm-runner/vector-transferops.mlir b/mlir/test/mlir-rocm-runner/vector-transferops.mlir
--- a/mlir/test/mlir-rocm-runner/vector-transferops.mlir
+++ b/mlir/test/mlir-rocm-runner/vector-transferops.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+// RUN: mlir-rocm-runner %s \
+// RUN:   --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
 
 func @vectransferx2(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>) {
   %cst = constant 1 : index
diff --git a/mlir/tools/mlir-cuda-runner/CMakeLists.txt b/mlir/tools/mlir-cuda-runner/CMakeLists.txt
--- a/mlir/tools/mlir-cuda-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-cuda-runner/CMakeLists.txt
@@ -37,6 +37,8 @@
 target_link_libraries(cuda-runtime-wrappers
   PUBLIC
   LLVMSupport
+  mlir_runner_utils_static
+  mlir_async_runtime_static
   ${CUDA_RUNTIME_LIBRARY}
 )
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -14,10 +14,12 @@
 
 #include "llvm/ADT/STLExtras.h"
 
+#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
+#include "mlir/Dialect/Async/Passes.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -117,7 +119,13 @@
   kernelPm.addPass(createConvertGPUKernelToBlobPass(
       translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
       "sm_35", "+ptx60", gpuBinaryAnnotation));
+  auto &funcPm = pm.nest<FuncOp>();
+  funcPm.addPass(createGpuAsyncRegionPass());
+  funcPm.addPass(createAsyncRefCountingPass());
   pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));
+  pm.addPass(createConvertAsyncToLLVMPass());
+  mlir::LowerToLLVMOptions lowerToLLVMOpts;
+  pm.addPass(mlir::createLowerToLLVMPass(lowerToLLVMOpts));
 
   return pm.run(m);
 }
diff --git a/mlir/tools/mlir-rocm-runner/CMakeLists.txt b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
--- a/mlir/tools/mlir-rocm-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
@@ -61,6 +61,7 @@
 target_link_libraries(rocm-runtime-wrappers
   PUBLIC
   LLVMSupport
+  mlir_runner_utils_static
   ${ROCM_RUNTIME_LIBRARY}
 )