diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -1,5 +1,5 @@
 //
-// NOTE: this test requires gpu-sm80
+// NOTE: this test requires gpu-sm80 and cusparselt
 //
 // RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
 // RUN:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
@@ -41,7 +41,8 @@
     %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    %token18 = gpu.destroy_dn_tensor async [%token17] %dnmat2
+    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
     %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
     %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
     %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
@@ -69,9 +70,9 @@
     %c64 = arith.constant 64 : index
 
     // Matrices A, B, C (16x32, 32x16, 16x16).
-    %a = memref.alloc() : memref<16x32xf16> // 16x32 but 2:4, row-major
-    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
-    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
+    %a = memref.alloc() : memref<16x32xf16> // 16x32 with 2:4, row-major
+    %b = memref.alloc() : memref<32x16xf16> // regular dense column-major
+    %c = memref.alloc() : memref<16x16xf16> // accumulator row-major
 
     //
     // Setup matrix A.
@@ -181,27 +182,8 @@
       vector.print %pb0 : vector<16xf16>
     }
 
-    // Maps the provided host buffers into the device address space.
-    // Writes from the host are guaranteed to be visible to device
-    // kernels that are launched afterwards. Writes from the device
-    // are guaranteed to be visible on the host after synchronizing
-    // with the device kernel completion.
-    %cast_a = memref.cast %a : memref<16x32xf16> to memref<*xf16>
-    gpu.host_register %cast_a : memref<*xf16>
-    %cast_b = memref.cast %b : memref<32x16xf16> to memref<*xf16>
-    gpu.host_register %cast_b : memref<*xf16>
-    %cast_c = memref.cast %c : memref<16x16xf16> to memref<*xf16>
-    gpu.host_register %cast_c : memref<*xf16>
-
     // Call the kernel.
-    %t1 = arith.constant 1 : index
-    %t32 = arith.constant 32 : index
     call @sampled_matmul (%a, %b, %c): (memref<16x32xf16>, memref<32x16xf16>, memref<16x16xf16>) -> ()
-
-    // Unmaps the host buffers.
-    gpu.host_unregister %cast_a : memref<*xf16>
-    gpu.host_unregister %cast_b : memref<*xf16>
-    gpu.host_unregister %cast_c : memref<*xf16>
 
     //
     // Verify computed matrix C.
@@ -227,7 +209,7 @@
       %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
      vector.print %pc0 : vector<16xf16>
     }
-
+    llvm.call @mgpuDestroySparseLtEnv() : () -> ()
     return
   }
 }