diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -52,6 +52,22 @@
           mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
           "any-storage-any-loop",
           "Enable sparse parallelization for any storage and loop."))};
 
+  PassOptions::Option<mlir::SparseDataTransferStrategy> transfer{
+      *this, "data-transfer-strategy",
+      ::llvm::cl::desc(
+          "Set the data transfer strategy between the host and the GPUs"),
+      ::llvm::cl::init(mlir::SparseDataTransferStrategy::kRegularDMA),
+      llvm::cl::values(
+          clEnumValN(mlir::SparseDataTransferStrategy::kRegularDMA,
+                     "regular-dma",
+                     "Default option: malloc on host without additional "
+                     "options or care and then use DMA to copy the data"),
+          clEnumValN(mlir::SparseDataTransferStrategy::kPinnedDMA, "pinned-dma",
+                     "Based on the default option, pin the host memory to "
+                     "accelerate the data transfer"),
+          clEnumValN(mlir::SparseDataTransferStrategy::kZeroCopy, "zero-copy",
+                     "Use zero-copy to perform the data transfer from the host "
+                     "to the GPU"))};
   PassOptions::Option<bool> enableIndexReduction{
       *this, "enable-index-reduction",
@@ -138,8 +154,9 @@
 
   /// Projects out the options for `createSparsificationPass`.
   SparsificationOptions sparsificationOptions() const {
-    return SparsificationOptions(parallelization, enableIndexReduction,
-                                 enableGPULibgen, enableRuntimeLibrary);
+    return SparsificationOptions(parallelization, transfer,
+                                 enableIndexReduction, enableGPULibgen,
+                                 enableRuntimeLibrary);
   }
 
   /// Projects out the options for `createSparseTensorConversionPass`.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -44,19 +44,25 @@
   // TODO: support reduction parallelization too?
 };
 
+enum class SparseDataTransferStrategy { kRegularDMA, kZeroCopy, kPinnedDMA };
+
 #define GEN_PASS_DECL
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"
 
 /// Options for the Sparsification pass.
 struct SparsificationOptions {
-  SparsificationOptions(SparseParallelizationStrategy p, bool idxReduc,
+  SparsificationOptions(SparseParallelizationStrategy p,
+                        SparseDataTransferStrategy t, bool idxReduc,
                         bool gpuLibgen, bool enableRT)
-      : parallelizationStrategy(p), enableIndexReduction(idxReduc),
-        enableGPULibgen(gpuLibgen), enableRuntimeLibrary(enableRT) {}
+      : parallelizationStrategy(p), dataTransferStrategy(t),
+        enableIndexReduction(idxReduc), enableGPULibgen(gpuLibgen),
+        enableRuntimeLibrary(enableRT) {}
   SparsificationOptions()
-      : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
+      : SparsificationOptions(SparseParallelizationStrategy::kNone,
+                              SparseDataTransferStrategy::kRegularDMA, false,
                               false, true) {}
   SparseParallelizationStrategy parallelizationStrategy;
+  SparseDataTransferStrategy dataTransferStrategy;
   bool enableIndexReduction;
   bool enableGPULibgen;
   bool enableRuntimeLibrary;
@@ -211,8 +217,8 @@
 void populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
                                       unsigned numThreads);
 
-void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
-                                     bool enableRT);
+void populateSparseGPULibgenPatterns(RewritePatternSet &patterns, bool enableRT,
+                                     SparseDataTransferStrategy transfer);
 
 std::unique_ptr<Pass> createSparseGPUCodegenPass();
 std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -102,6 +102,19 @@
              clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
                         "any-storage-any-loop",
                         "Enable sparse parallelization for any storage and loop."))}]>,
+    Option<"transfer", "data-transfer-strategy", "mlir::SparseDataTransferStrategy",
+           "mlir::SparseDataTransferStrategy::kRegularDMA",
+           "Set the data transfer strategy", [{llvm::cl::values(
+             clEnumValN(mlir::SparseDataTransferStrategy::kRegularDMA,
+                        "regular-dma",
+                        "Default option: malloc on host without additional "
+                        "options or care and then use DMA to copy the data"),
+             clEnumValN(mlir::SparseDataTransferStrategy::kPinnedDMA, "pinned-dma",
+                        "Based on the default option, pin the host memory to "
+                        "accelerate the data transfer"),
+             clEnumValN(mlir::SparseDataTransferStrategy::kZeroCopy, "zero-copy",
+                        "Use zero-copy to perform the data transfer from the host "
+                        "to the GPU"))}]>,
     Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
            "false",
            "Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
@@ -110,6 +123,7 @@
   ];
 }
 
+
 def PostSparsificationRewrite : Pass<"post-sparsification-rewrite", "ModuleOp"> {
   let summary = "Applies sparse tensor rewriting rules after sparsification";
   let description = [{
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -461,14 +461,18 @@
 }
 
 /// Match and rewrite SpMV kernel.
-static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
-                                 linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSpMV(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+            SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value x = op.getOperand(1);
   Value y = op.getOperand(2); // we have y = Ax
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // Only admissible sparse matrix format and dense vectors.
   bool isCOO = false;
   SparseTensorType aTp = getSparseTensorType(a);
@@ -485,15 +489,27 @@
   Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
   Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
   Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memR);
   Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
+  if (memC && dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memC);
   Value memV = genToValues(rewriter, loc, a);
-  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
-  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
-  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memV);
+  Value rowA = isZeroCopy ? memR : genAllocCopy(rewriter, loc, memR, tokens);
+  Value colA =
+      memC ? (isZeroCopy ? memC : genAllocCopy(rewriter, loc, memC, tokens))
+           : Value();
+  Value valA = isZeroCopy ? memV : genAllocCopy(rewriter, loc, memV, tokens);
   Value memX = genTensorToMemref(rewriter, loc, x);
-  Value vecX = genAllocCopy(rewriter, loc, memX, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memX);
+  Value vecX = isZeroCopy ? memX : genAllocCopy(rewriter, loc, memX, tokens);
   Value memY = genTensorToMemref(rewriter, loc, y);
-  Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memY);
+  Value vecY = isZeroCopy ? memY : genAllocCopy(rewriter, loc, memY, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
@@ -559,14 +575,18 @@
 }
 
 /// Match and rewrite SpMM kernel.
-static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
-                                 linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSpMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+            SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value b = op.getOperand(1);
   Value c = op.getOperand(2); // we have C = AB
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // Only admissible sparse matrix format and dense matrices.
   bool isCOO = false;
   SparseTensorType aTp = getSparseTensorType(a);
@@ -584,15 +604,27 @@
   Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
   Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
   Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memR);
   Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
+  if (memC && dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memC);
   Value memV = genToValues(rewriter, loc, a);
-  Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
-  Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
-  Value valA = genAllocCopy(rewriter, loc, memV, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memV);
+  Value rowA = isZeroCopy ? memR : genAllocCopy(rewriter, loc, memR, tokens);
+  Value colA =
+      memC ? (isZeroCopy ? memC : genAllocCopy(rewriter, loc, memC, tokens))
+           : Value();
+  Value valA = isZeroCopy ? memV : genAllocCopy(rewriter, loc, memV, tokens);
   Value bufB = genTensorToMemref(rewriter, loc, b);
-  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufB);
+  Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
   Value bufC = genTensorToMemref(rewriter, loc, c);
-  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufC);
+  Value matC = isZeroCopy ? bufC : genAllocCopy(rewriter, loc, bufC, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
@@ -662,24 +694,34 @@
 }
 
 // Match and rewrite 2:4 SpMM kernels.
-static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
-                                     linalg::GenericOp op) {
+static LogicalResult
+rewrite2To4SpMM(PatternRewriter &rewriter, linalg::GenericOp op,
+                SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value A = op.getOperand(0);
   Value B = op.getOperand(1);
   Value C = op.getOperand(2); // we have C = AB
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // All input should be dense tensors.
   if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
     return failure();
 
   Value bufA = genTensorToMemref(rewriter, loc, A);
-  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufA);
+  Value matA = isZeroCopy ? bufA : genAllocCopy(rewriter, loc, bufA, tokens);
   Value bufB = genTensorToMemref(rewriter, loc, B);
-  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufB);
+  Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
   Value bufC = genTensorToMemref(rewriter, loc, C);
-  Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufC);
+  Value matC = isZeroCopy ? bufC : genAllocCopy(rewriter, loc, bufC, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
   Value szm = linalg::createOrFoldDimOp(rewriter, loc, matA, 0);
@@ -765,14 +807,18 @@
 }
 
 /// Match and rewrite SDDMM kernel.
-static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
-                                  linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSDDMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+             SparseDataTransferStrategy dataTransferStrategy) {
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value b = op.getOperand(1);
   Value c = op.getOperand(2);
   SmallVector<Value> tokens;
+  bool isZeroCopy =
+      dataTransferStrategy == SparseDataTransferStrategy::kZeroCopy;
+
   // Only admissible sparse matrix format and dense matrices, no COO.
   bool isCOO = false;
   SparseTensorType aTp = getSparseTensorType(a);
@@ -793,15 +839,27 @@
   Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
   Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
   Value bufA = genTensorToMemref(rewriter, loc, a);
-  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufA);
+  Value matA = isZeroCopy ? bufA : genAllocCopy(rewriter, loc, bufA, tokens);
   Value bufB = genTensorToMemref(rewriter, loc, b);
-  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, bufB);
+  Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
   Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memR);
   Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
+  if (memC && dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memC);
   Value memV = genToValues(rewriter, loc, c);
-  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
-  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
-  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
+  if (dataTransferStrategy != SparseDataTransferStrategy::kRegularDMA)
+    genHostRegisterMemref(rewriter, loc, memV);
+  Value rowC = isZeroCopy ? memR : genAllocCopy(rewriter, loc, memR, tokens);
+  Value colC =
+      memC ? (isZeroCopy ? memC : genAllocCopy(rewriter, loc, memC, tokens))
+           : Value();
+  Value valC = isZeroCopy ? memV : genAllocCopy(rewriter, loc, memV, tokens);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
@@ -976,8 +1034,8 @@
 struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
   using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
 
-  LinalgOpRewriter(MLIRContext *context, bool rt)
-      : OpRewritePattern(context), enableRT(rt) {}
+  LinalgOpRewriter(MLIRContext *context, bool rt, SparseDataTransferStrategy t)
+      : OpRewritePattern(context), enableRT(rt), dataTransferStrategy(t) {}
 
   LogicalResult matchAndRewrite(linalg::GenericOp op,
                                 PatternRewriter &rewriter) const override {
@@ -1003,7 +1061,7 @@
         linalg::isReductionIterator(iteratorTypes[1]) &&
         // TODO: add transposed {i, j}
        maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
-      return rewriteSpMV(rewriter, op, enableRT);
+      return rewriteSpMV(rewriter, op, enableRT, dataTransferStrategy);
     }
 
     // Recognize a SpMM kernel.
@@ -1015,9 +1073,9 @@
         // TODO: maybe add transposed {i, j} in future
         maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
       if (op->getAttr("DENSE24"))
-        return rewrite2To4SpMM(rewriter, op);
+        return rewrite2To4SpMM(rewriter, op, dataTransferStrategy);
 
-      return rewriteSpMM(rewriter, op, enableRT);
+      return rewriteSpMM(rewriter, op, enableRT, dataTransferStrategy);
     }
 
     // Recognize a SDDMM kernel.
@@ -1029,7 +1087,7 @@
         // TODO: maybe add transposed {i, j} in future
         maps == infer({{i, k}, {k, j}, {i, j}}) &&
         matchSumReductionOfMulUnary(op)) {
-      return rewriteSDDMM(rewriter, op, enableRT);
+      return rewriteSDDMM(rewriter, op, enableRT, dataTransferStrategy);
     }
 
     return failure();
@@ -1037,6 +1095,7 @@
 
 private:
   bool enableRT;
+  SparseDataTransferStrategy dataTransferStrategy;
 };
 
 } // namespace
@@ -1056,7 +1115,8 @@
   patterns.add<ForallRewriter>(patterns.getContext(), numThreads);
 }
 
-void mlir::populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
-                                           bool enableRT) {
-  patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT);
+void mlir::populateSparseGPULibgenPatterns(
+    RewritePatternSet &patterns, bool enableRT,
+    SparseDataTransferStrategy transfer) {
+  patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT, transfer);
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -65,6 +65,7 @@
   SparsificationPass(const SparsificationPass &pass) = default;
   SparsificationPass(const SparsificationOptions &options) {
     parallelization = options.parallelizationStrategy;
+    transfer = options.dataTransferStrategy;
     enableIndexReduction = options.enableIndexReduction;
     enableGPULibgen = options.enableGPULibgen;
     enableRuntimeLibrary = options.enableRuntimeLibrary;
@@ -73,12 +74,13 @@
   void runOnOperation() override {
     auto *ctx = &getContext();
     // Translate strategy flags to strategy options.
-    SparsificationOptions options(parallelization, enableIndexReduction,
-                                  enableGPULibgen, enableRuntimeLibrary);
+    SparsificationOptions options(parallelization, transfer,
+                                  enableIndexReduction, enableGPULibgen,
+                                  enableRuntimeLibrary);
     // Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
     RewritePatternSet patterns(ctx);
     if (enableGPULibgen) {
-      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary, transfer);
     }
     populateSparsificationPatterns(patterns, options);
     scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -1,14 +1,16 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
-// RUN:   %s \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// DEFINE: %{compile} = mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// DEFINE:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// DEFINE:   %s
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE:   | FileCheck %s
+
+// RUN: %{compile} | %{run}
 
 module {
   llvm.func @mgpuCreateSparseLtEnv()
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -1,25 +1,26 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE:   | FileCheck %s
+//
+//
 // with RT lib (SoA COO):
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=pinned-dma" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=zero-copy" | %{run}
 //
 // without RT lib (AoS COO): note, may fall back to CPU
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=pinned-dma" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=zero-copy" | %{run}
 
 #SortedCOO = #sparse_tensor.encoding<{
   lvlTypes = [ "compressed-nu", "singleton" ]
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -1,25 +1,26 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e main --entry-point-result=void \
+// DEFINE:   | FileCheck %s
+//
 // with RT lib (SoA COO):
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=pinned-dma" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=zero-copy" | %{run}
 //
 // without RT lib (AoS COO): note, may fall back to CPU
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=pinned-dma" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=zero-copy" | %{run}
+//
 
 #SortedCOO = #sparse_tensor.encoding<{
   lvlTypes = [ "compressed-nu", "singleton" ]
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -1,28 +1,27 @@
 //
 // NOTE: this test requires gpu-sm80
 //
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
+// DEFINE:   mlir-cpu-runner \
+// DEFINE:   --shared-libs=%mlir_cuda_runtime \
+// DEFINE:   --shared-libs=%mlir_c_runner_utils \
+// DEFINE:   --e entry --entry-point-result=void \
+// DEFINE:   | FileCheck %s
+//
 // with RT lib:
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN:   mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e entry --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=pinned-dma" | %{run}
+// RUN: %{compile} enable-runtime-library=true data-transfer-strategy=zero-copy" | %{run}
 //
 // without RT lib:
 //
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN:   mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_cuda_runtime \
-// RUN:   --shared-libs=%mlir_c_runner_utils \
-// RUN:   --e entry --entry-point-result=void \
-// RUN: | FileCheck %s
-//
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=pinned-dma" | %{run}
+// RUN: %{compile} enable-runtime-library=false data-transfer-strategy=zero-copy" | %{run}
+//
 
 !Filename = !llvm.ptr