diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -52,6 +52,22 @@
               mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
               "any-storage-any-loop",
               "Enable sparse parallelization for any storage and loop."))};
+  PassOptions::Option<mlir::SparseDataTransferStrategy> transfer{
+      *this, "data-transfer-strategy",
+      ::llvm::cl::desc(
+          "Set the data transfer strategy between the host and the GPUs"),
+      ::llvm::cl::init(mlir::SparseDataTransferStrategy::kRegularDMA),
+      llvm::cl::values(
+          clEnumValN(mlir::SparseDataTransferStrategy::kRegularDMA,
+                     "regular-dma",
+                     "Default option: malloc on host without additional "
+                     "options or care and then use DMA to copy the data"),
+          clEnumValN(mlir::SparseDataTransferStrategy::kPinnedDMA, "pinned-dma",
+                     "Based on the default option, pin the host memory to "
+                     "accelerate the data transfer"),
+          clEnumValN(mlir::SparseDataTransferStrategy::kZeroCopy, "zero-copy",
+                     "Use zero-copy to perform the data transfer from the host "
+                     "to the GPU"))};
 
   PassOptions::Option<bool> enableIndexReduction{
       *this, "enable-index-reduction",
@@ -138,8 +154,9 @@
 
   /// Projects out the options for `createSparsificationPass`.
   SparsificationOptions sparsificationOptions() const {
-    return SparsificationOptions(parallelization, enableIndexReduction,
-                                 enableGPULibgen, enableRuntimeLibrary);
+    return SparsificationOptions(parallelization, transfer,
+                                 enableIndexReduction, enableGPULibgen,
+                                 enableRuntimeLibrary);
   }
 
   /// Projects out the options for `createSparseTensorConversionPass`.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -44,19 +44,25 @@
   // TODO: support reduction parallelization too?
 };
 
+enum class SparseDataTransferStrategy { kRegularDMA, kZeroCopy, kPinnedDMA };
+
 #define GEN_PASS_DECL
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"
 
 /// Options for the Sparsification pass.
 struct SparsificationOptions {
-  SparsificationOptions(SparseParallelizationStrategy p, bool idxReduc,
+  SparsificationOptions(SparseParallelizationStrategy p,
+                        SparseDataTransferStrategy t, bool idxReduc,
                         bool gpuLibgen, bool enableRT)
-      : parallelizationStrategy(p), enableIndexReduction(idxReduc),
-        enableGPULibgen(gpuLibgen), enableRuntimeLibrary(enableRT) {}
+      : parallelizationStrategy(p), dataTransferStrategy(t),
+        enableIndexReduction(idxReduc), enableGPULibgen(gpuLibgen),
+        enableRuntimeLibrary(enableRT) {}
   SparsificationOptions()
-      : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
+      : SparsificationOptions(SparseParallelizationStrategy::kNone,
+                              SparseDataTransferStrategy::kRegularDMA, false,
                               false, true) {}
   SparseParallelizationStrategy parallelizationStrategy;
+  SparseDataTransferStrategy dataTransferStrategy;
   bool enableIndexReduction;
   bool enableGPULibgen;
   bool enableRuntimeLibrary;
@@ -211,8 +217,8 @@
 void populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
                                       unsigned numThreads);
-void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
-                                     bool enableRT);
+void populateSparseGPULibgenPatterns(RewritePatternSet &patterns, bool enableRT,
+                                     SparseDataTransferStrategy transfer);
 
 std::unique_ptr<Pass> createSparseGPUCodegenPass();
 std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -102,6 +102,19 @@
              clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
                         "any-storage-any-loop",
                         "Enable sparse parallelization for any storage and loop."))}]>,
+    Option<"transfer", "data-transfer-strategy", "mlir::SparseDataTransferStrategy",
+           "mlir::SparseDataTransferStrategy::kRegularDMA",
+           "Set the data transfer strategy", [{llvm::cl::values(
+             clEnumValN(mlir::SparseDataTransferStrategy::kRegularDMA,
+                        "regular-dma",
+                        "Default option: malloc on host without additional "
+                        "options or care and then use DMA to copy the data"),
+             clEnumValN(mlir::SparseDataTransferStrategy::kPinnedDMA, "pinned-dma",
+                        "Based on the default option, pin the host memory to "
+                        "accelerate the data transfer"),
+             clEnumValN(mlir::SparseDataTransferStrategy::kZeroCopy, "zero-copy",
+                        "Use zero-copy to perform the data transfer from the host "
+                        "to the GPU"))}]>,
     Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
            "false",
            "Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
@@ -110,6 +123,7 @@
   ];
 }
 
+
 def PostSparsificationRewrite : Pass<"post-sparsification-rewrite", "ModuleOp"> {
   let summary = "Applies sparse tensor rewriting rules after sparsification";
   let description = [{
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -461,14 +461,18 @@
 }
 
 /// Match and rewrite SpMV kernel.
-static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
-                                 linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSpMV(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+            SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value x = op.getOperand(1);
   Value y = op.getOperand(2); // we have y = Ax
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // Only admissible sparse matrix format and dense vectors.
   bool isCOO = false;
   SparseTensorType aTp = getSparseTensorType(a);
@@ -487,12 +491,23 @@
   Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
   Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
   Value memV = genToValues(rewriter, loc, a);
+  Value memX = genTensorToMemref(rewriter, loc, x);
+  Value memY = genTensorToMemref(rewriter, loc, y);
+
+  Value memR_cast, memC_cast, memV_cast, memX_cast, memY_cast;
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    memR_cast = genHostRegisterMemref(rewriter, loc, memR);
+    if (memC)
+      memC_cast = genHostRegisterMemref(rewriter, loc, memC);
+    memV_cast = genHostRegisterMemref(rewriter, loc, memV);
+    memX_cast = genHostRegisterMemref(rewriter, loc, memX);
+    memY_cast = genHostRegisterMemref(rewriter, loc, memY);
+  }
+
   Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
   Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
   Value valA = genAllocCopy(rewriter, loc, memV, tokens);
-  Value memX = genTensorToMemref(rewriter, loc, x);
-  Value vecX = genAllocCopy(rewriter, loc, memX, tokens);
-  Value memY = genTensorToMemref(rewriter, loc, y);
+  Value vecX = isZeroCopy ? memX : genAllocCopy(rewriter, loc, memX, tokens);
   Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
@@ -541,16 +556,27 @@
               .getAsyncToken();
   token = rewriter.create(loc, tokenTp, token, dnY)
              .getAsyncToken();
+
   token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
   token = genDeallocMemRef(rewriter, loc, buffer, token);
-  token = genDeallocMemRef(rewriter, loc, vecX, token);
+  if (!isZeroCopy)
+    token = genDeallocMemRef(rewriter, loc, vecX, token);
   token = genCopyMemRef(rewriter, loc, memY, vecY, token);
   token = genDeallocMemRef(rewriter, loc, vecY, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
+
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    genHostUnregisterMemref(rewriter, loc, memR_cast);
+    if (memC)
+      genHostUnregisterMemref(rewriter, loc, memC_cast);
+    genHostUnregisterMemref(rewriter, loc, memV_cast);
+    genHostUnregisterMemref(rewriter, loc, memX_cast);
+    genHostUnregisterMemref(rewriter, loc, memY_cast);
+  }
   tokens.clear();
 
   // Done.
@@ -559,14 +585,18 @@
 }
 
 /// Match and rewrite SpMM kernel.
-static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
-                                 linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSpMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+            SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value b = op.getOperand(1);
   Value c = op.getOperand(2); // we have C = AB
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // Only admissible sparse matrix format and dense matrices.
   bool isCOO = false;
   SparseTensorType aTp = getSparseTensorType(a);
@@ -586,12 +616,22 @@
   Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
   Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
   Value memV = genToValues(rewriter, loc, a);
+  Value bufB = genTensorToMemref(rewriter, loc, b);
+  Value bufC = genTensorToMemref(rewriter, loc, c);
+  Value memR_cast, memC_cast, memV_cast, bufB_cast, bufC_cast;
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    memR_cast = genHostRegisterMemref(rewriter, loc, memR);
+    if (memC)
+      memC_cast = genHostRegisterMemref(rewriter, loc, memC);
+    memV_cast = genHostRegisterMemref(rewriter, loc, memV);
+    bufB_cast = genHostRegisterMemref(rewriter, loc, bufB);
+    bufC_cast = genHostRegisterMemref(rewriter, loc, bufC);
+  }
+
   Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
   Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
   Value valA = genAllocCopy(rewriter, loc, memV, tokens);
-  Value bufB = genTensorToMemref(rewriter, loc, b);
-  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
-  Value bufC = genTensorToMemref(rewriter, loc, c);
+  Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
   Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
@@ -649,11 +689,20 @@
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
   token = genDeallocMemRef(rewriter, loc, buffer, token);
-  token = genDeallocMemRef(rewriter, loc, matB, token);
+  if (!isZeroCopy)
+    token = genDeallocMemRef(rewriter, loc, matB, token);
   token = genCopyMemRef(rewriter, loc, bufC, matC, token);
   token = genDeallocMemRef(rewriter, loc, matC, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    genHostUnregisterMemref(rewriter, loc, memR_cast);
+    if (memC)
+      genHostUnregisterMemref(rewriter, loc, memC_cast);
+    genHostUnregisterMemref(rewriter, loc, memV_cast);
+    genHostUnregisterMemref(rewriter, loc, bufB_cast);
+    genHostUnregisterMemref(rewriter, loc, bufC_cast);
+  }
   tokens.clear();
 
   // Done.
@@ -662,23 +711,35 @@
 }
 
 // Match and rewrite 2:4 SpMM kernels.
-static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
-                                     linalg::GenericOp op) {
+static LogicalResult
+rewrite2To4SpMM(PatternRewriter &rewriter, linalg::GenericOp op,
+                SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value A = op.getOperand(0);
   Value B = op.getOperand(1);
   Value C = op.getOperand(2); // we have C = AB
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // All input should be dense tensors.
   if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
     return failure();
 
   Value bufA = genTensorToMemref(rewriter, loc, A);
-  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
   Value bufB = genTensorToMemref(rewriter, loc, B);
-  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
   Value bufC = genTensorToMemref(rewriter, loc, C);
+
+  Value bufA_cast, bufB_cast, bufC_cast;
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    bufA_cast = genHostRegisterMemref(rewriter, loc, bufA);
+    bufB_cast = genHostRegisterMemref(rewriter, loc, bufB);
+    bufC_cast = genHostRegisterMemref(rewriter, loc, bufC);
+  }
+
+  Value matA = isZeroCopy ? bufA : genAllocCopy(rewriter, loc, bufA, tokens);
+  Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
   Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
@@ -753,26 +814,38 @@
   token = genDeallocMemRef(rewriter, loc, buffer, token);
   token = genDeallocMemRef(rewriter, loc, buffer2, token);
   token = genDeallocMemRef(rewriter, loc, buffer3, token);
-  token = genDeallocMemRef(rewriter, loc, matA, token);
-  token = genDeallocMemRef(rewriter, loc, matB, token);
+
+  if (!isZeroCopy)
+    token = genDeallocMemRef(rewriter, loc, matA, token);
+  if (!isZeroCopy)
+    token = genDeallocMemRef(rewriter, loc, matB, token);
   token = genCopyMemRef(rewriter, loc, bufC, matC, token);
   token = genDeallocMemRef(rewriter, loc, matC, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    genHostUnregisterMemref(rewriter, loc, bufA_cast);
+    genHostUnregisterMemref(rewriter, loc, bufB_cast);
+    genHostUnregisterMemref(rewriter, loc, bufC_cast);
+  }
   tokens.clear();
 
   rewriter.replaceOpWithNewOp(op, bufC);
   return success();
 }
 
 /// Match and rewrite SDDMM kernel.
-static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
-                                  linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSDDMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+             SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value b = op.getOperand(1);
   Value c = op.getOperand(2);
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // Only admissible sparse matrix format and dense matrices, no COO.
   bool isCOO = false;
   SparseTensorType aTp = getSparseTensorType(a);
@@ -793,12 +866,23 @@
   Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
   Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
   Value bufA = genTensorToMemref(rewriter, loc, a);
-  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
   Value bufB = genTensorToMemref(rewriter, loc, b);
-  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
   Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
   Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
   Value memV = genToValues(rewriter, loc, c);
+
+  Value bufB_cast, bufA_cast, memR_cast, memC_cast, memV_cast;
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    bufB_cast = genHostRegisterMemref(rewriter, loc, bufB);
+    bufA_cast = genHostRegisterMemref(rewriter, loc, bufA);
+    memR_cast = genHostRegisterMemref(rewriter, loc, memR);
+    if (memC)
+      memC_cast = genHostRegisterMemref(rewriter, loc, memC);
+    memV_cast = genHostRegisterMemref(rewriter, loc, memV);
+  }
+
+  Value matA = isZeroCopy ? bufA : genAllocCopy(rewriter, loc, bufA, tokens);
+  Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
   Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
   Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
   Value valC = genAllocCopy(rewriter, loc, memV, tokens);
@@ -849,8 +933,10 @@
   token = rewriter.create(loc, tokenTp, token, spMatC)
              .getAsyncToken();
   token = genDeallocMemRef(rewriter, loc, buffer, token);
-  token = genDeallocMemRef(rewriter, loc, matA, token);
-  token = genDeallocMemRef(rewriter, loc, matB, token);
+  if (!isZeroCopy) {
+    token = genDeallocMemRef(rewriter, loc, matA, token);
+    token = genDeallocMemRef(rewriter, loc, matB, token);
+  }
   token = genDeallocMemRef(rewriter, loc, rowC, token);
   if (colC)
     token = genDeallocMemRef(rewriter, loc, colC, token);
@@ -858,6 +944,14 @@
   token = genDeallocMemRef(rewriter, loc, valC, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA) {
+    genHostUnregisterMemref(rewriter, loc, bufB_cast);
+    genHostUnregisterMemref(rewriter, loc, bufA_cast);
+    genHostUnregisterMemref(rewriter, loc, memR_cast);
+    if (memC)
+      genHostUnregisterMemref(rewriter, loc, memC_cast);
+    genHostUnregisterMemref(rewriter, loc, memV_cast);
+  }
   tokens.clear();
 
   // Done.
@@ -976,8 +1070,8 @@
 struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
   using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
 
-  LinalgOpRewriter(MLIRContext *context, bool rt)
-      : OpRewritePattern(context), enableRT(rt) {}
+  LinalgOpRewriter(MLIRContext *context, bool rt, SparseDataTransferStrategy t)
+      : OpRewritePattern(context), enableRT(rt), dataTransferStrategy(t) {}
 
   LogicalResult matchAndRewrite(linalg::GenericOp op,
                                 PatternRewriter &rewriter) const override {
@@ -1003,7 +1097,7 @@
         linalg::isReductionIterator(iteratorTypes[1]) &&
         // TODO: add transposed {i, j}
         maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
-      return rewriteSpMV(rewriter, op, enableRT);
+      return rewriteSpMV(rewriter, op, enableRT, dataTransferStrategy);
     }
 
     // Recognize a SpMM kernel.
@@ -1015,9 +1109,9 @@
        // TODO: maybe add transposed {i, j} in future
        maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
       if (op->getAttr("DENSE24"))
-        return rewrite2To4SpMM(rewriter, op);
+        return rewrite2To4SpMM(rewriter, op, dataTransferStrategy);
 
-      return rewriteSpMM(rewriter, op, enableRT);
+      return rewriteSpMM(rewriter, op, enableRT, dataTransferStrategy);
     }
 
     // Recognize a SDDMM kernel.
@@ -1029,7 +1123,7 @@
         // TODO: maybe add transposed {i, j} in future
         maps == infer({{i, k}, {k, j}, {i, j}}) &&
         matchSumReductionOfMulUnary(op)) {
-      return rewriteSDDMM(rewriter, op, enableRT);
+      return rewriteSDDMM(rewriter, op, enableRT, dataTransferStrategy);
     }
 
     return failure();
@@ -1037,6 +1131,7 @@
 
 private:
   bool enableRT;
+  SparseDataTransferStrategy dataTransferStrategy;
 };
 
 } // namespace
@@ -1056,7 +1151,8 @@
   patterns.add(patterns.getContext(), numThreads);
 }
 
-void mlir::populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
-                                           bool enableRT) {
-  patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT);
+void mlir::populateSparseGPULibgenPatterns(
+    RewritePatternSet &patterns, bool enableRT,
+    SparseDataTransferStrategy transfer) {
+  patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT, transfer);
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -65,6 +65,7 @@
   SparsificationPass(const SparsificationPass &pass) = default;
   SparsificationPass(const SparsificationOptions &options) {
     parallelization = options.parallelizationStrategy;
+    transfer = options.dataTransferStrategy;
    enableIndexReduction = options.enableIndexReduction;
     enableGPULibgen = options.enableGPULibgen;
     enableRuntimeLibrary = options.enableRuntimeLibrary;
@@ -73,12 +74,13 @@
   void runOnOperation() override {
     auto *ctx = &getContext();
     // Translate strategy flags to strategy options.
-    SparsificationOptions options(parallelization, enableIndexReduction,
-                                  enableGPULibgen, enableRuntimeLibrary);
+    SparsificationOptions options(parallelization, transfer,
+                                  enableIndexReduction, enableGPULibgen,
+                                  enableRuntimeLibrary);
     // Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
     RewritePatternSet patterns(ctx);
     if (enableGPULibgen) {
-      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary, transfer);
     }
     populateSparsificationPatterns(patterns, options);
     scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -1,13 +1,17 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+
+// RUN: %{compile}" | %{run}
+// RUN: %{compile} data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} data-transfer-strategy=zero-copy" | %{run}
 
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -1,14 +1,16 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
-// RUN:   %s \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// DEFINE: %{compile} = mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// DEFINE:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// DEFINE:   %s
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+
+// RUN: %{compile} | %{run}
 
 module {
   llvm.func @mgpuCreateSparseLtEnv()
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -1,25 +1,26 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+//
+//
 // with RT lib (SoA COO):
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} enable-runtime-library=true data-transfer-strategy=zero-copy" | %{run}
 //
 // without RT lib (AoS COO): note, may fall back to CPU
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} enable-runtime-library=false data-transfer-strategy=zero-copy" | %{run}
 
 #SortedCOO = #sparse_tensor.encoding<{
   lvlTypes = [ "compressed-nu", "singleton" ]
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -1,25 +1,26 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+//
 // with RT lib (SoA COO):
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} enable-runtime-library=true data-transfer-strategy=zero-copy" | %{run}
 //
 // without RT lib (AoS COO): note, may fall back to CPU
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} enable-runtime-library=false data-transfer-strategy=zero-copy" | %{run}
+//
 
 #SortedCOO = #sparse_tensor.encoding<{
   lvlTypes = [ "compressed-nu", "singleton" ]
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -1,28 +1,27 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
+// DEFINE: mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e entry --entry-point-result=void \
+// DEFINE: | FileCheck %s
+//
 // with RT lib:
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e entry --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} enable-runtime-library=true data-transfer-strategy=zero-copy" | %{run}
 //
 // without RT lib:
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e entry --entry-point-result=void \
-// RUN: | FileCheck %s
-//
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=pinned-dma" | %{run}
+// %{compile} enable-runtime-library=false data-transfer-strategy=zero-copy" | %{run}
+//
 
 !Filename = !llvm.ptr
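
A minimal sketch (not part of the patch) of the IR shape that the pinned-dma and
zero-copy strategies rely on, assuming the standard gpu.host_register /
gpu.host_unregister and memref.cast ops emitted by genHostRegisterMemref and
genHostUnregisterMemref; the function name and buffer shape below are made up
for illustration:

  func.func @pin_host_buffer(%x: memref<1024xf64>) {
    // gpu.host_register expects an unranked memref, so cast first.
    %ux = memref.cast %x : memref<1024xf64> to memref<*xf64>
    // pinned-dma: pin the host allocation, then keep the usual
    // gpu.alloc / gpu.memcpy / library-call sequence.
    // zero-copy: the registered host memref is handed to the device code
    // directly, so the gpu.alloc / gpu.memcpy pair is skipped.
    gpu.host_register %ux : memref<*xf64>
    // ... device work elided ...
    gpu.host_unregister %ux : memref<*xf64>
    return
  }

Under regular-dma nothing is registered, which matches the previous behavior of
these rewrites.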