diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -76,24 +76,28 @@
 }
 
 /// Constructs code to launch GPU kernel.
-static void genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
-                             SmallVectorImpl<Value> &args,
-                             unsigned numThreads) {
+static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
+                              SmallVectorImpl<Value> &args,
+                              SmallVectorImpl<Value> &tokens,
+                              unsigned numThreads) {
   Location loc = gpuFunc->getLoc();
   Value none = TypedValue<::mlir::IntegerType>{};
   Value one = constantIndex(builder, loc, 1);
   Value numT = constantIndex(builder, loc, numThreads);
   gpu::KernelDim3 gridSize = {one, one, one};
   gpu::KernelDim3 blckSize = {numT, one, one};
-  builder.create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
-                                    /*dynSharedMemSz*/ none, args);
+  return builder
+      .create<gpu::LaunchFuncOp>(loc, gpuFunc, gridSize, blckSize,
+                                 /*dynSharedMemSz*/ none, args,
+                                 builder.getType<gpu::AsyncTokenType>(), tokens)
+      .getAsyncToken();
 }
 
 /// Maps the provided ranked host buffer into the device address space.
 /// Writes from the host are guaranteed to be visible to device kernels
 /// that are launched afterwards. Writes from the device are guaranteed
 /// to be visible on the host after synchronizing with the device kernel
-/// completion.
+/// completion. Needs to cast the buffer to an unranked buffer.
 static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                    Value mem) {
   MemRefType memTp = mem.getType().cast<MemRefType>();
@@ -101,7 +105,122 @@
   UnrankedMemRefType resTp =
       UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
   Value cast = builder.create<memref::CastOp>(loc, resTp, mem);
   builder.create<gpu::HostRegisterOp>(loc, cast);
-  return mem; // convenience pass-through
+  return cast;
+}
+
+/// Unmaps the provided buffer, expecting the casted buffer.
+static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
+                                    Value cast) {
+  builder.create<gpu::HostUnregisterOp>(loc, cast);
+}
+
+/// Generates the first wait in an asynchronous chain.
+static Value genFirstWait(OpBuilder &builder, Location loc) {
+  Type tokenType = builder.getType<gpu::AsyncTokenType>();
+  return builder.create<gpu::WaitOp>(loc, tokenType, ValueRange())
+      .getAsyncToken();
+}
+
+/// Generates the last, blocking wait in an asynchronous chain.
+static void genBlockingWait(OpBuilder &builder, Location loc,
+                            ValueRange operands) {
+  builder.create<gpu::WaitOp>(loc, Type(), operands);
+}
+
+/// Allocates memory on the device.
+/// TODO: A `host_shared` attribute could be used to indicate that the
+///       buffer is visible to both host and device, but lowering that
+///       feature does not seem to be fully supported yet.
+static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
+                                   Value token) {
+  auto tp = mem.getType().cast<ShapedType>();
+  auto elemTp = tp.getElementType();
+  auto shape = tp.getShape();
+  auto memTp = MemRefType::get(shape, elemTp);
+  SmallVector<Value> dynamicSizes;
+  for (unsigned r = 0, rank = tp.getRank(); r < rank; r++) {
+    if (shape[r] == ShapedType::kDynamic) {
+      Value dim = constantIndex(builder, loc, r);
+      Value dimOp = builder.create<memref::DimOp>(loc, mem, dim);
+      dynamicSizes.push_back(dimOp);
+    }
+  }
+  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
+                                      token, dynamicSizes, ValueRange());
+}
+
+/// Deallocates memory from the device.
+static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
+                              Value token) {
+  return builder.create<gpu::DeallocOp>(loc, token.getType(), token, mem)
+      .getAsyncToken();
+}
+
+/// Copies memory between host and device (direction is implicit).
+static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
+                           Value src, Value token) {
+  return builder.create<gpu::MemcpyOp>(loc, token.getType(), token, dst, src)
+      .getAsyncToken();
+}
+
+/// Prepares the outlined arguments, passing scalars and buffers in. Here we
+/// assume that the first buffer is the one allocated for output. We create
+/// a set of properly chained asynchronous allocation/copy pairs to increase
+/// overlap before launching the kernel.
+/// TODO: the output assumption may be a bit too brittle
+static Value genParametersIn(OpBuilder &builder, Location loc,
+                             SmallVectorImpl<Value> &scalars,
+                             SmallVectorImpl<Value> &buffers,
+                             SmallVectorImpl<Value> &args,
+                             SmallVectorImpl<Value> &tokens,
+                             bool useHostRegistrationForOut) {
+  Value out;
+  // Scalars are passed by value.
+  for (Value s : scalars)
+    args.push_back(s);
+  // Buffers need to be made visible on the device.
+  for (Value b : buffers) {
+    if (useHostRegistrationForOut) {
+      out = genHostRegisterMemref(builder, loc, b);
+      args.push_back(b);
+      useHostRegistrationForOut = false;
+      continue;
+    }
+    Value firstToken = genFirstWait(builder, loc);
+    auto alloc = genAllocMemRef(builder, loc, b, firstToken);
+    Value devMem = alloc.getResult(0);
+    Value depToken = alloc.getAsyncToken(); // copy-after-alloc
+    args.push_back(devMem);
+    tokens.push_back(genCopyMemRef(builder, loc, devMem, b, depToken));
+  }
+  return out;
+}
+
+/// Finalizes the outlined arguments. The output buffer is copied back out,
+/// chained on the kernel token, and then deallocated. All other buffers are
+/// simply deallocated. Then we wait for all operations to complete.
+static void genParametersOut(OpBuilder &builder, Location loc, Value out,
+                             Value kernelToken, SmallVectorImpl<Value> &scalars,
+                             SmallVectorImpl<Value> &buffers,
+                             SmallVectorImpl<Value> &args,
+                             SmallVectorImpl<Value> &tokens) {
+  unsigned base = scalars.size();
+  for (unsigned i = base, e = args.size(); i < e; i++) {
+    Value firstToken;
+    if (i == base) {
+      // Assumed output parameter: unregister or copy-out.
+      if (out) {
+        genHostUnregisterMemref(builder, loc, out);
+        out = Value();
+        continue;
+      }
+      firstToken =
+          genCopyMemRef(builder, loc, buffers[0], args[i], kernelToken);
+    } else {
+      firstToken = genFirstWait(builder, loc);
+    }
+    tokens.push_back(genDeallocMemRef(builder, loc, args[i], firstToken));
+  }
 }
 
 /// Constructs code for new GPU kernel.
@@ -158,10 +277,8 @@
 
 /// Proof-of-concept rewriter. This rule generates a CUDA implementation
 /// for each outermost forall loop generated by the sparse compiler.
-//
-// TODO: right works with parallelization-strategy=dense-outer-loop
-// but give this its own flags in the future
-//
+/// TODO: right now works with parallelization-strategy=dense-outer-loop
+///       but give this its own flags in the future
 struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
   using OpRewritePattern<scf::ParallelOp>::OpRewritePattern;
@@ -211,22 +328,31 @@
       else
        return failure(); // don't know how to share
    }
-    // Prepare the outlined arguments, register buffers.
+    // Pass outlined non-constant values.
     Location loc = forallOp->getLoc();
     SmallVector<Value> args;
-    for (Value s : scalars)
-      args.push_back(s);
-    for (Value b : buffers)
-      args.push_back(genHostRegisterMemref(rewriter, loc, b));
-    auto saveIp = rewriter.saveInsertionPoint();
+    SmallVector<Value> tokens;
+    Value out = genParametersIn(rewriter, loc, scalars, buffers, args, tokens,
+                                /*useHostRegistrationForOut=*/false);
     // Set up GPU module and construct GPU function.
+    auto saveIp = rewriter.saveInsertionPoint();
     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
     auto gpuModule = genGPUModule(rewriter, topModule);
     auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
-    // Generate code that launches the kernel.
+    // Generate code that launches the kernel asynchronously, blocking on all
+    // open tokens and yielding a new token for the output.
+    // TODO: Passing in tokens to the launch op does not seem to be properly
+    // lowered by cubin yet, hence the current blocking wait.
     rewriter.restoreInsertionPoint(saveIp);
-    genLaunchGPUFunc(rewriter, gpuFunc, args, numThreads);
+    genBlockingWait(rewriter, loc, tokens);
+    tokens.clear();
+    Value kernelToken =
+        genLaunchGPUFunc(rewriter, gpuFunc, args, tokens, numThreads);
+    // Finalize the outlined arguments.
+    genParametersOut(rewriter, loc, out, kernelToken, scalars, buffers, args,
+                     tokens);
+    genBlockingWait(rewriter, loc, tokens);
     rewriter.eraseOp(forallOp);
     return success();
   }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
@@ -7,12 +7,46 @@
 //
 // CHECK-LABEL: gpu.module @sparse_kernels
-// CHECK-DAG: gpu.func @kernel0
-// CHECK-DAG: gpu.func @kernel1
+// CHECK: gpu.func @kernel1
+// CHECK: gpu.func @kernel0
 //
 // CHECK-LABEL: func.func @matmuls
-// CHECK-DAG: gpu.launch_func @sparse_kernels::@kernel0 blocks
-// CHECK-DAG: gpu.launch_func @sparse_kernels::@kernel1 blocks
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel1 blocks
+// CHECK: gpu.memcpy async [%[[T1]]]
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.wait
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: gpu.alloc async
+// CHECK: gpu.memcpy async
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.memcpy async [%[[T0]]]
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.dealloc async
+// CHECK: gpu.wait
 //
 func.func @matmuls(%A: tensor<1024x8xf64>,
                    %B: tensor<8x1024xf64, #CSR>,
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
@@ -47,12 +47,34 @@
 //
 //
 // CHECK-LABEL: func.func @matmul
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S0:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S1:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S2:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S3:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S4:.*]] = gpu.memcpy async
+// CHECK: gpu.wait [%[[S0]], %[[S1]], %[[S2]], %[[S3]], %[[S4]]
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T0]]]
+// CHECK: %[[M1:.*]] = gpu.dealloc async [%[[M0]]]
+// CHECK: %[[M2:.*]] = gpu.wait async
+// CHECK: %[[M3:.*]] = gpu.dealloc async [%[[M2]]]
+// CHECK: %[[M4:.*]] = gpu.wait async
+// CHECK: %[[M5:.*]] = gpu.dealloc async [%[[M4]]]
+// CHECK: %[[M6:.*]] = gpu.wait async
+// CHECK: %[[M7:.*]] = gpu.dealloc async [%[[M6]]]
+// CHECK: %[[M8:.*]] = gpu.wait async
+// CHECK: %[[M9:.*]] = gpu.dealloc async [%[[M8]]]
+// CHECK: gpu.wait [%[[M1]], %[[M3]], %[[M5]], %[[M7]], %[[M9]]
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
@@ -43,12 +43,34 @@
 // CHECK: }
 //
 // CHECK-LABEL: func.func @matvec
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.host_register
-// CHECK: gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S0:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S1:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S2:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S3:.*]] = gpu.memcpy async
+// CHECK: gpu.wait async
+// CHECK: gpu.alloc async
+// CHECK: %[[S4:.*]] = gpu.memcpy async
+// CHECK: gpu.wait [%[[S0]], %[[S1]], %[[S2]], %[[S3]], %[[S4]]
+// CHECK: %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
+// CHECK: %[[M0:.*]] = gpu.memcpy async [%[[T0]]]
+// CHECK: %[[M1:.*]] = gpu.dealloc async [%[[M0]]]
+// CHECK: %[[M2:.*]] = gpu.wait async
+// CHECK: %[[M3:.*]] = gpu.dealloc async [%[[M2]]]
+// CHECK: %[[M4:.*]] = gpu.wait async
+// CHECK: %[[M5:.*]] = gpu.dealloc async [%[[M4]]]
+// CHECK: %[[M6:.*]] = gpu.wait async
+// CHECK: %[[M7:.*]] = gpu.dealloc async [%[[M6]]]
+// CHECK: %[[M8:.*]] = gpu.wait async
+// CHECK: %[[M9:.*]] = gpu.dealloc async [%[[M8]]]
+// CHECK: gpu.wait [%[[M1]], %[[M3]], %[[M5]], %[[M7]], %[[M9]]
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
@@ -0,0 +1,65 @@
+//
+// NOTE: this test requires gpu-sm80
+//
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --e main --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>
+
+module {
+  // Compute matrix vector y = Ax
+  func.func @matvec(%A: tensor<1024x64xf64, #CSR>, %x: tensor<64xf64>, %y_in: tensor<1024xf64>) -> tensor<1024xf64> {
+    %y_out = linalg.matvec
+      ins(%A, %x: tensor<1024x64xf64, #CSR>, tensor<64xf64>)
+      outs(%y_in: tensor<1024xf64>) -> tensor<1024xf64>
+    return %y_out : tensor<1024xf64>
+  }
+
+  memref.global "private" constant @__constant_64xf64 : memref<64xf64> = dense<1.000000e+00> {alignment = 64 : i64}
+
+  func.func @main() {
+    %f0 = arith.constant 0.0 : f64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+
+    // Stress test with a dense matrix DA.
+    %DA = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      %k = arith.addi %i, %j : index
+      %l = arith.index_cast %k : index to i64
+      %f = arith.uitofp %l : i64 to f64
+      tensor.yield %f : f64
+    } : tensor<1024x64xf64>
+
+    // Convert to a "sparse" 1024 x 64 matrix A.
+    %A = sparse_tensor.convert %DA : tensor<1024x64xf64> to tensor<1024x64xf64, #CSR>
+
+    // Initialize dense vector to 1024 zeros.
+    %y = tensor.generate {
+    ^bb0(%i : index):
+      tensor.yield %f0 : f64
+    } : tensor<1024xf64>
+
+    // Call the kernel with a vector taken from global memory.
+    %xbuf = memref.get_global @__constant_64xf64 : memref<64xf64>
+    %x = bufferization.to_tensor %xbuf restrict : memref<64xf64>
+    %0 = call @matvec(%A, %x, %y) : (tensor<1024x64xf64, #CSR>, tensor<64xf64>, tensor<1024xf64>) -> tensor<1024xf64>
+
+    //
+    // Sanity check on results.
+    //
+    // CHECK: ( 2016, 2080, 2144, 2208, 2272, 2336, 2400, 2464, 2528, 2592, 2656, 2720, 2784, 2848, 2912, 2976, 3040, 3104, 3168, 3232, 3296, 3360, 3424, 3488, 3552, 3616, 3680, 3744, 3808, 3872, 3936, 4000, 4064, 4128, 4192, 4256, 4320, 4384, 4448, 4512, 4576, 4640, 4704, 4768, 4832, 4896, 4960, 5024, 5088, 5152, 5216, 5280, 5344, 5408, 5472, 5536, 5600, 5664, 5728, 5792, 5856, 5920, 5984, 6048 )
+    //
+    %pb0 = vector.transfer_read %0[%c0], %f0 : tensor<1024xf64>, vector<64xf64>
+    vector.print %pb0 : vector<64xf64>
+
+    // Release the resources.
+    bufferization.dealloc_tensor %A : tensor<1024x64xf64, #CSR>
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -342,7 +342,7 @@
     vector.print %pb0 : vector<32xf16>
   }
 
-  // Maps the provided host buffer into the device address space.
+  // Maps the provided host buffers into the device address space.
   // Writes from the host are guaranteed to be visible to device
   // kernels that are launched afterwards. Writes from the device
   // are guaranteed to be visible on the host after synchronizing
@@ -368,6 +368,12 @@
            %b : memref<8x32xf16>,
            %c : memref<16x8xf16>)
 
+    // Unmaps the host buffers.
+    gpu.host_unregister %cast_a : memref<*xf16>
+    gpu.host_unregister %cast_m : memref<*xi16>
+    gpu.host_unregister %cast_b : memref<*xf16>
+    gpu.host_unregister %cast_c : memref<*xf16>
+
     //
     // Verify computed matrix C.
     //
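
For reference, the host-side IR produced by this rewrite now follows the shape sketched below. This is a minimal illustration only, not output copied from the tests above: the buffer type, value names, and the elided gpu.launch_func operands are assumptions.

func.func @async_copy_sketch(%host : memref<1024xf64>) {
  // Per input buffer: start an async chain, allocate device memory, and
  // copy the host data in, chained on the allocation token.
  %t0 = gpu.wait async
  %dev, %t1 = gpu.alloc async [%t0] () : memref<1024xf64>
  %t2 = gpu.memcpy async [%t1] %dev, %host : memref<1024xf64>, memref<1024xf64>
  // Block on all outstanding transfer tokens before launching the kernel.
  gpu.wait [%t2]
  // %kt = gpu.launch_func async @sparse_kernels::@kernel0 blocks ... would
  // appear here; in the real output its token chains the copy-out below.
  %t3 = gpu.wait async  // stands in for the kernel token in this sketch
  // Copy the (assumed) output buffer back and release device memory.
  %t4 = gpu.memcpy async [%t3] %host, %dev : memref<1024xf64>, memref<1024xf64>
  %t5 = gpu.dealloc async [%t4] %dev : memref<1024xf64>
  gpu.wait [%t5]
  return
}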