diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1713,6 +1713,22 @@
   }];
 }
 
+def GPU_Prune2To4SpMatFlag : I32EnumAttr<"Prune2To4SpMatFlag",
+  "determines whether to prune and/or prune-check the 2:4 sparse matrix",
+  [
+    I32EnumAttrCase<"NONE", 0>,
+    I32EnumAttrCase<"PRUNE_ONLY", 1>,
+    I32EnumAttrCase<"PRUNE_AND_CHECK", 2>,
+  ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = GPU_Dialect.cppNamespace;
+}
+
+def GPU_Prune2To4SpMatFlagAttr : EnumAttr<GPU_Dialect, GPU_Prune2To4SpMatFlag,
+                                          "prune_2to4_spmat_flag"> {
+  let defaultValue = "Prune2To4SpMatFlag::PRUNE_AND_CHECK";
+}
+
 def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> {
   let summary = "Create sparse matrix with 2:4 sparsity operation";
@@ -1730,20 +1746,21 @@
     Example:
 
     ```mlir
-    %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?x?xf16>
+    %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem, PRUNE_AND_CHECK : memref<?x?xf16>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    Index:$rows,
                    Index:$cols,
+                   GPU_Prune2To4SpMatFlagAttr:$pruneFlag,
                    AnyMemRef:$memref);
   let results = (outs Res<GPU_SparseSpMatHandle>:$spMat,
                  Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+    $rows `,` $cols `,` $memref `,` $pruneFlag attr-dict `:` type($memref)
   }];
 }
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -287,7 +287,7 @@
       llvmVoidType,
       {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type,
-       llvmPointerType /*void *stream*/}};
+       llvmPointerType /*void *stream*/, llvmInt32Type /*int32_t prune_flag*/}};
   FunctionCallBuilder createCuSparseLtSpMMBuilder = {
       "mgpuCuSparseLtSpMM",
       llvmVoidType,
@@ -747,6 +747,10 @@
     llvm_unreachable("unsupported element type");
 }
 
+static gpu::Prune2To4SpMatFlag get2To4PruneFlag(Value spMat) {
+  auto op = spMat.getDefiningOp<gpu::Create2To4SpMatOp>();
+  return op.getPruneFlag();
+}
 // TODO: We may want a run-time (of the mlir compiler) disablement/warning:
 // cusparseLt currently won't work for cuda architecture <8.0 and will trigger a
 // runtime (of the CUDA program) error, but it might be great if we could at
@@ -1628,6 +1632,8 @@
     auto stream = adaptor.getAsyncDependencies().front();
     Value bufferSize;
     if (is2To4Sparsity(op.getSpmatA())) {
+      auto prune_flag =
+          genConstInt32From(rewriter, loc, get2To4PruneFlag(op.getSpmatA()));
       auto computeType = genConstInt32From(
           rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
       auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
@@ -1637,7 +1643,8 @@
       createCuSparseLtSpMMBufferSizeBuilder
           .create(loc, rewriter,
                   {bufferSize, modeA, modeB, adaptor.getSpmatA(),
-                   adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream})
+                   adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream,
+                   prune_flag})
           .getResult();
 
       auto bufferSizePtr1 = rewriter.create<LLVM::GEPOp>(
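A note on the lowering above: the new flag is emitted with `genConstInt32From`, so the patch relies on the `I32EnumAttrCase` values (0, 1, 2) matching the `int32_t` values that the runtime wrapper (changed later in this patch) decodes. A minimal standalone sketch of that invariant, for illustration only; `toRuntimeFlag` is a hypothetical helper, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the I32EnumAttr cases declared in GPUOps.td above.
enum class Prune2To4SpMatFlag : int32_t { NONE = 0, PRUNE_ONLY = 1, PRUNE_AND_CHECK = 2 };

// What the lowering effectively does with the attribute's value: pass the
// enum's underlying integer straight through to the runtime call.
static int32_t toRuntimeFlag(Prune2To4SpMatFlag flag) {
  return static_cast<int32_t>(flag);
}

int main() {
  assert(toRuntimeFlag(Prune2To4SpMatFlag::PRUNE_AND_CHECK) == 2);
  return 0;
}
```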
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -692,7 +692,8 @@
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);
   Operation *spGenA = rewriter.create<gpu::Create2To4SpMatOp>(
-      loc, spMatHandleTp, tokenTp, token, szm, szk, matA);
+      loc, spMatHandleTp, tokenTp, token, szm, szk,
+      gpu::Prune2To4SpMatFlag::PRUNE_AND_CHECK, matA);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -630,9 +630,12 @@
 // Several things are being done in this stage: algorithm selection, planning,
 // and returning workspace and compressed matrices data buffer sizes.
+// prune_flag indicates whether pruning and a prune check are performed:
+// 0 = no prune and no prune check, 1 = prune only, 2 = prune and prune check.
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
-                             void *c, int32_t ctp, CUstream stream) {
+                             void *c, int32_t ctp, CUstream stream,
+                             int32_t prune_flag) {
   assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
   // TODO: support more advanced settings, e.g., the input right operand is a
   // sparse matrix, assuming matA is the sparse matrix
@@ -662,23 +665,26 @@
       &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
 
   // Pruning step (in-place).
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseLtSpMMAPrune(&cusparseLt_env, &(matA->matmul), matA->values,
-                           matA->values, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+  if (prune_flag > 0)
+    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune(
+        &cusparseLt_env, &(matA->matmul), matA->values, matA->values,
+        CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
 
   // Check structure of A.
   // Note that this adds a synchronization on the stream.
   // TODO: Do we want that?
-  int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
-      &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
-  int valid = 0;
-  mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
-  mgpuStreamSynchronize(stream);
-  mgpuMemFree(dvalid, stream);
-  if (valid != 0)
-    fprintf(stderr, "CUPARSE-LT: sparse matrix is not 2:4; computed results "
-                    "will be invalid\n");
+  if (prune_flag == 2) {
+    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream);
+    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
+        &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
+    int valid = 0;
+    mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
+    mgpuStreamSynchronize(stream);
+    mgpuMemFree(dvalid, stream);
+    if (valid != 0)
+      fprintf(stderr, "CUSPARSE-LT: sparse matrix is not 2:4; computed results "
+                      "will be invalid\n");
+  }
 
   CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
       &cusparseLt_env, &(matA->plan), &workspace_size_))
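For readers unfamiliar with the 2:4 format: the structural property that `cusparseLtSpMMAPruneCheck` validates above is that every aligned group of four consecutive values within a row contains at most two nonzeros. A host-side sketch of that property (an illustration only, not cuSPARSELt's implementation; `is2To4Sparse` is hypothetical and uses `float` for brevity, while the GPU path operates on `f16`):

```cpp
#include <cstdio>

// Returns true iff each aligned group of 4 values in a row has <= 2 nonzeros.
static bool is2To4Sparse(const float *values, int rows, int cols) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c + 4 <= cols; c += 4) {
      int nonzeros = 0;
      for (int k = 0; k < 4; ++k)
        nonzeros += values[r * cols + c + k] != 0.0f ? 1 : 0;
      if (nonzeros > 2)
        return false; // pruning this group would have to drop a nonzero
    }
  return true;
}

int main() {
  const float ok[8] = {1, 0, 2, 0, 0, 3, 0, 4};  // every group has 2 nonzeros
  const float bad[8] = {1, 2, 3, 0, 0, 3, 0, 4}; // first group has 3 nonzeros
  printf("%d %d\n", is2To4Sparse(ok, 2, 4), is2To4Sparse(bad, 2, 4)); // 1 0
  return 0;
}
```

This also explains why the warning path above only fires for flag value 2 (PRUNE_AND_CHECK): with PRUNE_ONLY the matrix is forced into 2:4 form in-place without verification, and with NONE the caller's data is expected to already satisfy the property.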
diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
@@ -18,7 +18,7 @@
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xf16>
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf16>
-    %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1: memref<?xf16>
+    %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1, PRUNE_AND_CHECK: memref<?xf16>
     %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf16>
     %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16
     %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
@@ -30,7 +30,7 @@
 // CHECK: %[[VAL_27:.*]] = memref.dim %[[VAL_16]], %[[VAL_3]] : memref<?x?xf16>
 // CHECK: %[[VAL_28:.*]] = memref.dim %[[VAL_23]], %[[VAL_4]] : memref<?x?xf16>
 // CHECK: %[[VAL_29:.*]] = gpu.wait async
-// CHECK: %[[VAL_30:.*]], %[[VAL_31:.*]] = gpu.create_2to4_spmat async {{\[}}%[[VAL_29]]] %[[VAL_26]], %[[VAL_27]], %[[VAL_9]] : memref<?x?xf16>
+// CHECK: %[[VAL_30:.*]], %[[VAL_31:.*]] = gpu.create_2to4_spmat async {{\[}}%[[VAL_29]]] %[[VAL_26]], %[[VAL_27]], %[[VAL_9]], PRUNE_AND_CHECK : memref<?x?xf16>
 // CHECK: %[[VAL_32:.*]], %[[VAL_33:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_31]]] %[[VAL_16]], %[[VAL_27]], %[[VAL_28]] : index, index into memref<?x?xf16>
 // CHECK: %[[VAL_34:.*]], %[[VAL_35:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_33]]] %[[VAL_23]], %[[VAL_26]], %[[VAL_28]] : index, index into memref<?x?xf16>
 // CHECK: %[[VAL_36:.*]]:3, %[[VAL_37:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_35]]] %[[VAL_30]], %[[VAL_32]], %[[VAL_34]] : index, index, index into f16
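The three `index` results of `gpu.spmm_buffer_size` checked above correspond to the three sizes `mgpuCuSparseLtSpMMBufferSize` writes through its leading `void *bs` pointer. A caller-side sketch, under the assumption that the wrapper stores three consecutive `int64_t` values in the order workspace / compressed matrix / compression buffer (the declaration below also substitutes `void *` for `CUstream` so it stays self-contained):

```cpp
#include <cstdint>
#include <cstdio>

// Assumed C signature of the wrapper after this patch (see the diff above).
extern "C" void mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb,
                                             void *a, void *b, void *c,
                                             int32_t ctp, void *stream,
                                             int32_t prune_flag);

int main() {
  // Assumed layout: [0] workspace, [1] compressed matrix, [2] compress buffer.
  int64_t sizes[3] = {0, 0, 0};
  // mgpuCuSparseLtSpMMBufferSize(sizes, ma, mb, a, b, c, ctp, stream,
  //                              /*prune_flag=*/2); // call elided: needs a GPU
  printf("workspace=%lld compressed=%lld buffer=%lld\n", (long long)sizes[0],
         (long long)sizes[1], (long long)sizes[2]);
  return 0;
}
```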
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -31,7 +31,7 @@
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a: memref<16x32xf16>
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a, PRUNE_AND_CHECK: memref<16x32xf16>
     %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
     %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
     %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16