diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir @@ -1,8 +1,21 @@ // // NOTE: this test requires gpu-sm80 // +// with RT lib: +// +// RUN: mlir-opt %s \ +// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ +// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \ +// RUN: mlir-cpu-runner \ +// RUN: --shared-libs=%mlir_cuda_runtime \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --e entry --entry-point-result=void \ +// RUN: | FileCheck %s +// +// without RT lib: +// // RUN: mlir-opt %s \ -// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ +// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ // RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \ // RUN: mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ @@ -21,20 +34,21 @@ indexing_maps = [ affine_map<(i,j,k) -> (i,k)>, // A affine_map<(i,j,k) -> (k,j)>, // B - affine_map<(i,j,k) -> (i,j)> // S (out) + affine_map<(i,j,k) -> (i,j)> // S (in/out) ], iterator_types = ["parallel", "parallel", "reduction"], - doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)" + doc = "S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j)" } // // Integration test that lowers a kernel annotated as sparse to -// actual sparse code, initializes a matching sparse storage scheme -// from file, and runs the resulting code with the JIT compiler. +// actual sparse code, initializes sparse storage schemes, and +// runs the resulting code with the JIT compiler. // module { // - // A kernel that computes a sampled matrix matrix multiplication. + // A kernel that computes a sampled dense matrix matrix multiplication + // using a "spy" function and in-place update of the sampling sparse matrix. // func.func @sampled_dense_dense(%args: tensor, %arga: tensor, @@ -64,7 +78,7 @@ func.func private @getTensorFilename(index) -> (!Filename) // - // Main driver that reads matrix from file and calls the sparse kernel. + // Main driver. // func.func @entry() { %d0 = arith.constant 0.0 : f32 @@ -74,11 +88,6 @@ %c10 = arith.constant 10 : index // Initialize dense matrices. - %x = tensor.generate %c5, %c5 { - ^bb0(%i : index, %j : index): - tensor.yield %d0 : f32 - } : tensor - %a = tensor.generate %c5, %c10 { ^bb0(%i: index, %j: index): %p = arith.addi %i, %c1 : index @@ -86,7 +95,6 @@ %d = arith.sitofp %q : i32 to f32 tensor.yield %d : f32 } : tensor - %b = tensor.generate %c10, %c5 { ^bb0(%i: index, %j: index): %p = arith.addi %j, %c1 : index @@ -104,15 +112,42 @@ : (tensor, tensor, tensor) -> tensor + // // Print the result for verification. // // CHECK: ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255 ) + // %vm = sparse_tensor.values %0 : tensor to memref %vv = vector.transfer_read %vm[%c0], %d0 : memref, vector<9xf32> vector.print %vv : vector<9xf32> + // Create a much sparser sampling matrix. + %t = arith.constant sparse<[[0,0], [0,1], [1,0], [3,4], [7,7]], + [1.0, 2.0, 3.0, 4.0, 5.0] + > : tensor<8x8xf32> + %q = sparse_tensor.convert %t : tensor<8x8xf32> to tensor + %a2 = arith.constant dense<2.0> : tensor<8x8xf32> + %b1 = arith.constant dense<1.0> : tensor<8x8xf32> + %a2c = tensor.cast %a2 : tensor<8x8xf32> to tensor + %b1c = tensor.cast %b1 : tensor<8x8xf32> to tensor + + // Call the kernel again. + %1 = call @sampled_dense_dense(%q, %a2c, %b1c) + : (tensor, + tensor, tensor) -> tensor + + // + // Print the result for verification. + // + // CHECK: ( ( 17, 18, 0, 0, 0, 0, 0, 0 ), ( 19, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 20, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 21 ) ) + // + %d = sparse_tensor.convert %1 : tensor to tensor + %mm = vector.transfer_read %d[%c0, %c0], %d0 : tensor, vector<8x8xf32> + vector.print %mm : vector<8x8xf32> + // Release the resources. bufferization.dealloc_tensor %0 : tensor + bufferization.dealloc_tensor %1 : tensor return }