diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -601,7 +601,7 @@
   tokens.clear();
 
   // Done.
-  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, matC);
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
   return success();
 }
 
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -64,7 +64,7 @@
 // CHECK: %[[VAL_64:.*]] = gpu.memcpy async {{\[}}%[[VAL_63]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
 // CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_38]] : memref<?x?xf64>
 // CHECK: gpu.wait {{\[}}%[[VAL_65]]]
-// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_38]] : memref<?x?xf64>
+// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_34]] : memref<?x?xf64>
 // CHECK: return %[[VAL_66]] : tensor<?x?xf64>
 // CHECK: }
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
new file
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -0,0 +1,178 @@
+//
+// NOTE: this test requires gpu-sm80
+//
+// with RT lib (SoA COO):
+//
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --e main --entry-point-result=void \
+// RUN: | FileCheck %s
+//
+// without RT lib (AoS COO): note, may fall back to CPU
+//
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --e main --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#SortedCOO = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed-nu", "singleton" ]
+}>
+
+#CSR = #sparse_tensor.encoding<{
+  lvlTypes = [ "dense", "compressed" ],
+  posWidth = 32,
+  crdWidth = 32
+}>
+
+module {
+  // Computes C = A x B with A sparse COO.
+  func.func @matmulCOO(%A: tensor<8x8xf32, #SortedCOO>,
+                       %B: tensor<8x8xf32>,
+                       %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
+    %D = linalg.matmul
+      ins(%A, %B: tensor<8x8xf32, #SortedCOO>, tensor<8x8xf32>)
+      outs(%C: tensor<8x8xf32>) -> tensor<8x8xf32>
+    return %D: tensor<8x8xf32>
+  }
+
+  // Computes C = A x B with A sparse CSR.
+  func.func @matmulCSR(%A: tensor<8x8xf32, #CSR>,
+                       %B: tensor<8x8xf32>,
+                       %C: tensor<8x8xf32>) -> tensor<8x8xf32> {
+    %D = linalg.matmul
+      ins(%A, %B: tensor<8x8xf32, #CSR>, tensor<8x8xf32>)
+      outs(%C: tensor<8x8xf32>) -> tensor<8x8xf32>
+    return %D: tensor<8x8xf32>
+  }
+
+  func.func @dump(%mat: tensor<8x8xf32>) {
+    %f0 = arith.constant 0.0 : f32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    %c4 = arith.constant 4 : index
+    %c5 = arith.constant 5 : index
+    %c6 = arith.constant 6 : index
+    %c7 = arith.constant 7 : index
+    %r0 = vector.transfer_read %mat[%c0,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r0 : vector<8xf32>
+    %r1 = vector.transfer_read %mat[%c1,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r1 : vector<8xf32>
+    %r2 = vector.transfer_read %mat[%c2,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r2 : vector<8xf32>
+    %r3 = vector.transfer_read %mat[%c3,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r3 : vector<8xf32>
+    %r4 = vector.transfer_read %mat[%c4,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r4 : vector<8xf32>
+    %r5 = vector.transfer_read %mat[%c5,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r5 : vector<8xf32>
+    %r6 = vector.transfer_read %mat[%c6,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r6 : vector<8xf32>
+    %r7 = vector.transfer_read %mat[%c7,%c0], %f0 : tensor<8x8xf32>, vector<8xf32>
+    vector.print %r7 : vector<8xf32>
+    return
+  }
+
+  //
+  // Main driver.
+  //
+  func.func @main() {
+    %f0 = arith.constant 0.0 : f32
+    %f1 = arith.constant 1.0 : f32
+
+    // Stress test with a dense matrix DA.
+    %DA = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      %k = arith.addi %i, %j : index
+      %l = arith.index_cast %k : index to i64
+      %f = arith.uitofp %l : i64 to f32
+      tensor.yield %f : f32
+    } : tensor<8x8xf32>
+
+    // Convert to a "sparse" matrix A.
+    %Acoo = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO>
+    %Acsr = sparse_tensor.convert %DA : tensor<8x8xf32> to tensor<8x8xf32, #CSR>
+
+    // Initial C matrices.
+    %C0 = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      tensor.yield %f0 : f32
+    } : tensor<8x8xf32>
+    %C1 = tensor.generate {
+    ^bb0(%i: index, %j: index):
+      tensor.yield %f1 : f32
+    } : tensor<8x8xf32>
+
+    // Call the kernels.
+    %0 = call @matmulCOO(%Acoo, %DA, %C0) : (tensor<8x8xf32, #SortedCOO>,
+                                             tensor<8x8xf32>,
+                                             tensor<8x8xf32>) -> tensor<8x8xf32>
+    %1 = call @matmulCSR(%Acsr, %DA, %C0) : (tensor<8x8xf32, #CSR>,
+                                             tensor<8x8xf32>,
+                                             tensor<8x8xf32>) -> tensor<8x8xf32>
+    %2 = call @matmulCOO(%Acoo, %DA, %C1) : (tensor<8x8xf32, #SortedCOO>,
+                                             tensor<8x8xf32>,
+                                             tensor<8x8xf32>) -> tensor<8x8xf32>
+    %3 = call @matmulCSR(%Acsr, %DA, %C1) : (tensor<8x8xf32, #CSR>,
+                                             tensor<8x8xf32>,
+                                             tensor<8x8xf32>) -> tensor<8x8xf32>
+
+    //
+    // Sanity check on results.
+    //
+    // CHECK:      ( 140, 168, 196, 224, 252, 280, 308, 336 )
+    // CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
+    // CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
+    // CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
+    // CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
+    // CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
+    // CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
+    // CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
+    //
+    // CHECK:      ( 140, 168, 196, 224, 252, 280, 308, 336 )
+    // CHECK-NEXT: ( 168, 204, 240, 276, 312, 348, 384, 420 )
+    // CHECK-NEXT: ( 196, 240, 284, 328, 372, 416, 460, 504 )
+    // CHECK-NEXT: ( 224, 276, 328, 380, 432, 484, 536, 588 )
+    // CHECK-NEXT: ( 252, 312, 372, 432, 492, 552, 612, 672 )
+    // CHECK-NEXT: ( 280, 348, 416, 484, 552, 620, 688, 756 )
+    // CHECK-NEXT: ( 308, 384, 460, 536, 612, 688, 764, 840 )
+    // CHECK-NEXT: ( 336, 420, 504, 588, 672, 756, 840, 924 )
+    //
+    // CHECK:      ( 141, 169, 197, 225, 253, 281, 309, 337 )
+    // CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
+    // CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
+    // CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
+    // CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
+    // CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
+    // CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
+    // CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
+    //
+    // CHECK:      ( 141, 169, 197, 225, 253, 281, 309, 337 )
+    // CHECK-NEXT: ( 169, 205, 241, 277, 313, 349, 385, 421 )
+    // CHECK-NEXT: ( 197, 241, 285, 329, 373, 417, 461, 505 )
+    // CHECK-NEXT: ( 225, 277, 329, 381, 433, 485, 537, 589 )
+    // CHECK-NEXT: ( 253, 313, 373, 433, 493, 553, 613, 673 )
+    // CHECK-NEXT: ( 281, 349, 417, 485, 553, 621, 689, 757 )
+    // CHECK-NEXT: ( 309, 385, 461, 537, 613, 689, 765, 841 )
+    // CHECK-NEXT: ( 337, 421, 505, 589, 673, 757, 841, 925 )
+    //
+    call @dump(%0) : (tensor<8x8xf32>) -> ()
+    call @dump(%1) : (tensor<8x8xf32>) -> ()
+    call @dump(%2) : (tensor<8x8xf32>) -> ()
+    call @dump(%3) : (tensor<8x8xf32>) -> ()
+
+    // Release the resources.
+    bufferization.dealloc_tensor %Acoo : tensor<8x8xf32, #SortedCOO>
+    bufferization.dealloc_tensor %Acsr : tensor<8x8xf32, #CSR>
+
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -11,7 +11,15 @@
 // RUN:   --e main --entry-point-result=void \
 // RUN: | FileCheck %s
 //
-// TODO: without RT lib (AoS COO):
+// without RT lib (AoS COO): note, may fall back to CPU
+//
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --e main --entry-point-result=void \
+// RUN: | FileCheck %s
 
 #SortedCOO = #sparse_tensor.encoding<{
   lvlTypes = [ "compressed-nu", "singleton" ]
 }>
@@ -42,6 +50,7 @@
 
   func.func @main() {
     %f0 = arith.constant 0.0 : f64
+    %f1 = arith.constant 1.0 : f64
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
 
@@ -52,11 +61,11 @@
       %l = arith.index_cast %k : index to i64
       %f = arith.uitofp %l : i64 to f64
       tensor.yield %f : f64
-    } : tensor<1024x64xf64>
+    } : tensor<64x64xf64>
 
     // Convert to a "sparse" m x n matrix A.
-    %Acoo = sparse_tensor.convert %DA : tensor<1024x64xf64> to tensor<?x?xf64, #SortedCOO>
-    %Acsr = sparse_tensor.convert %DA : tensor<1024x64xf64> to tensor<?x?xf64, #CSR>
+    %Acoo = sparse_tensor.convert %DA : tensor<64x64xf64> to tensor<?x?xf64, #SortedCOO>
+    %Acsr = sparse_tensor.convert %DA : tensor<64x64xf64> to tensor<?x?xf64, #CSR>
 
     // Initialize dense vector with n elements:
     // (1, 2, 3, 4, ..., n)
@@ -69,26 +78,46 @@
       tensor.yield %f : f64
     } : tensor<?xf64>
 
-    // Initialize dense vector to m zeros.
+    // Initialize dense vectors to m zeros and m ones.
     %d0 = tensor.dim %Acoo, %c0 : tensor<?x?xf64, #SortedCOO>
-    %y = tensor.generate %d0 {
+    %y0 = tensor.generate %d0 {
     ^bb0(%i : index):
       tensor.yield %f0 : f64
     } : tensor<?xf64>
+    %y1 = tensor.generate %d0 {
+    ^bb0(%i : index):
+      tensor.yield %f1 : f64
+    } : tensor<?xf64>
 
     // Call the kernels.
-    %0 = call @matvecCOO(%Acoo, %x, %y) : (tensor<?x?xf64, #SortedCOO>, tensor<?xf64>, tensor<?xf64>) -> tensor<?xf64>
-    %1 = call @matvecCSR(%Acsr, %x, %y) : (tensor<?x?xf64, #CSR>, tensor<?xf64>, tensor<?xf64>) -> tensor<?xf64>
+    %0 = call @matvecCOO(%Acoo, %x, %y0) : (tensor<?x?xf64, #SortedCOO>,
+                                            tensor<?xf64>,
+                                            tensor<?xf64>) -> tensor<?xf64>
+    %1 = call @matvecCSR(%Acsr, %x, %y0) : (tensor<?x?xf64, #CSR>,
+                                            tensor<?xf64>,
+                                            tensor<?xf64>) -> tensor<?xf64>
+    %2 = call @matvecCOO(%Acoo, %x, %y1) : (tensor<?x?xf64, #SortedCOO>,
+                                            tensor<?xf64>,
+                                            tensor<?xf64>) -> tensor<?xf64>
+    %3 = call @matvecCSR(%Acsr, %x, %y1) : (tensor<?x?xf64, #CSR>,
+                                            tensor<?xf64>,
+                                            tensor<?xf64>) -> tensor<?xf64>
 
     //
-    // Sanity check on results.
+    // Sanity check on the results.
     //
     // CHECK-COUNT-2: ( 87360, 89440, 91520, 93600, 95680, 97760, 99840, 101920, 104000, 106080, 108160, 110240, 112320, 114400, 116480, 118560, 120640, 122720, 124800, 126880, 128960, 131040, 133120, 135200, 137280, 139360, 141440, 143520, 145600, 147680, 149760, 151840, 153920, 156000, 158080, 160160, 162240, 164320, 166400, 168480, 170560, 172640, 174720, 176800, 178880, 180960, 183040, 185120, 187200, 189280, 191360, 193440, 195520, 197600, 199680, 201760, 203840, 205920, 208000, 210080, 212160, 214240, 216320, 218400 )
     //
+    // CHECK-COUNT-2: ( 87361, 89441, 91521, 93601, 95681, 97761, 99841, 101921, 104001, 106081, 108161, 110241, 112321, 114401, 116481, 118561, 120641, 122721, 124801, 126881, 128961, 131041, 133121, 135201, 137281, 139361, 141441, 143521, 145601, 147681, 149761, 151841, 153921, 156001, 158081, 160161, 162241, 164321, 166401, 168481, 170561, 172641, 174721, 176801, 178881, 180961, 183041, 185121, 187201, 189281, 191361, 193441, 195521, 197601, 199681, 201761, 203841, 205921, 208001, 210081, 212161, 214241, 216321, 218401 )
+    //
     %pb0 = vector.transfer_read %0[%c0], %f0 : tensor<?xf64>, vector<64xf64>
     vector.print %pb0 : vector<64xf64>
     %pb1 = vector.transfer_read %1[%c0], %f0 : tensor<?xf64>, vector<64xf64>
     vector.print %pb1 : vector<64xf64>
+    %pb2 = vector.transfer_read %2[%c0], %f0 : tensor<?xf64>, vector<64xf64>
+    vector.print %pb2 : vector<64xf64>
+    %pb3 = vector.transfer_read %3[%c0], %f0 : tensor<?xf64>, vector<64xf64>
+    vector.print %pb3 : vector<64xf64>
 
     // Release the resources.
     bufferization.dealloc_tensor %Acoo : tensor<?x?xf64, #SortedCOO>
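
Note on the one-line fix in SparseGPUCodegen.cpp: in the generated host code, the cuSPARSE result lives in a device buffer (matC, captured as %[[VAL_38]] in the FileCheck lines above) and is copied back into a host buffer (bufC, %[[VAL_34]]) by the final async gpu.memcpy, after which the device buffer is deallocated. A minimal sketch of the corrected epilogue, with illustrative value names (%t0, %t1, %t2, %devC, %hostC, %result are not from the patch, and the exact generated IR may differ):

  // Copy the device-side result back to the host, then free the device buffer.
  %t1 = gpu.memcpy async [%t0] %hostC, %devC : memref<?x?xf64>, memref<?x?xf64>
  %t2 = gpu.dealloc async [%t1] %devC : memref<?x?xf64>
  gpu.wait [%t2]
  // The result tensor must view the host buffer (bufC); the old code built it
  // from the already-deallocated device buffer (matC).
  %result = bufferization.to_tensor %hostC : memref<?x?xf64>

This is why the rewriter call switches from matC to bufC and, correspondingly, the gpu_matmul_lib.mlir CHECK line switches from %[[VAL_38]] to %[[VAL_34]].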