diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -29,6 +29,7 @@ ${NVPTX_LIBS} LINK_LIBS PUBLIC + MLIRAsyncToLLVM MLIRGPU MLIRIR MLIRLLVMIR diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -16,6 +16,7 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "../PassDetail.h" +#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -293,10 +294,13 @@ void GpuToLLVMConversionPass::runOnOperation() { LLVMTypeConverter converter(&getContext()); OwningRewritePatternList patterns; + LLVMConversionTarget target(getContext()); + populateStdToLLVMConversionPatterns(converter, patterns); + populateAsyncStructuralTypeConversionsAndLegality(&getContext(), converter, + patterns, target); populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation); - LLVMConversionTarget target(getContext()); if (failed( applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); diff --git a/mlir/test/mlir-cuda-runner/async.mlir b/mlir/test/mlir-cuda-runner/async.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/mlir-cuda-runner/async.mlir @@ -0,0 +1,66 @@ +// RUN: mlir-cuda-runner %s --entry-point-result=void -O0 \ +// RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ +// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_async_runtime%shlibext \ +// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func 
@main() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %count = constant 2 : index + + // initialize h0 on host + %h0 = alloc(%count) : memref<?xi32> + %h0_unranked = memref_cast %h0 : memref<?xi32> to memref<*xi32> + gpu.host_register %h0_unranked : memref<*xi32> + + %v0 = constant 42 : i32 + store %v0, %h0[%c0] : memref<?xi32> + store %v0, %h0[%c1] : memref<?xi32> + + // copy h0 to b0 on device. + %t0, %f0 = async.execute () -> !async.value<memref<?xi32>> { + %b0 = gpu.alloc(%count) : memref<?xi32> + gpu.memcpy %b0, %h0 : memref<?xi32>, memref<?xi32> + async.yield %b0 : memref<?xi32> + } + + // copy h0 to b1 and b2 (fork) + %t1, %f1 = async.execute [%t0] ( + %f0 as %b0 : !async.value<memref<?xi32>> + ) -> !async.value<memref<?xi32>> { + %b1 = gpu.alloc(%count) : memref<?xi32> + gpu.memcpy %b1, %b0 : memref<?xi32>, memref<?xi32> + async.yield %b1 : memref<?xi32> + } + %t2, %f2 = async.execute [%t0] ( + %f0 as %b0 : !async.value<memref<?xi32>> + ) -> !async.value<memref<?xi32>> { + %b2 = gpu.alloc(%count) : memref<?xi32> + gpu.memcpy %b2, %b0 : memref<?xi32>, memref<?xi32> + async.yield %b2 : memref<?xi32> + } + + // h0 = b1 + b2 (join). + %t3 = async.execute [%t1, %t2] ( + %f1 as %b1 : !async.value<memref<?xi32>>, + %f2 as %b2 : !async.value<memref<?xi32>> + ) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %c1, %block_z = %c1) { + %v1 = load %b1[%tx] : memref<?xi32> + %v2 = load %b2[%tx] : memref<?xi32> + %sum = addi %v1, %v2 : i32 + store %sum, %h0[%tx] : memref<?xi32> + gpu.terminator + } + async.yield + } + + async.await %t3 : !async.token + // CHECK: [84, 84] + call @print_memref_i32(%h0_unranked) : (memref<*xi32>) -> () + return +} + +func private @print_memref_i32(memref<*xi32>) diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -14,10 +14,12 @@ #include "llvm/ADT/STLExtras.h" +#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include 
"mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Dialect/Async/Passes.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/GPU/Passes.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -118,7 +120,14 @@ kernelPm.addPass(createConvertGPUKernelToBlobPass( translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", "sm_35", "+ptx60", gpuBinaryAnnotation)); + auto &funcPm = pm.nest<FuncOp>(); + funcPm.addPass(createGpuAsyncRegionPass()); + funcPm.addPass(createAsyncRefCountingPass()); pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation)); + pm.addPass(createAsyncToAsyncRuntimePass()); + pm.addPass(createConvertAsyncToLLVMPass()); + mlir::LowerToLLVMOptions lower_to_llvm_opts; + pm.addPass(mlir::createLowerToLLVMPass(lower_to_llvm_opts)); return pm.run(m); }