diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1789,6 +1789,40 @@
   }];
 }
 
+def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix with 2:4 sparsity operation";
+  let description = [{
+    The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense
+    format with 2:4 sparsity.
+    The buffers must already be copied from the host to the device prior to
+    using this operation. The operation returns a handle to the sparse
+    matrix descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the sparse matrix
+    handle.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Index:$rows,
+                       Index:$cols,
+                       AnyMemRef:$memref);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spMat,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+  }];
+}
+
 def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
   let summary = "Destroy sparse matrix operation";
   let description = [{
@@ -1960,7 +1994,7 @@
     Example:
 
     ```mlir
-    %buffersz, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC
+    %bufferszs, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64
     ```
   }];
 
@@ -1971,11 +2005,12 @@
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC);
-  let results = (outs Res<Index>:$bufferSz,
+  let results = (outs Res<AnyTypeOf<[Index, Variadic<Index>]>>:$bufferSzs,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let builders = [OpBuilder<(ins
-      "::mlir::Type":$bufferSz,
+      "::mlir::Type":$bufferSzs,
       "::mlir::Type":$asyncToken,
       "::mlir::ValueRange":$asyncDependencies,
       "::mlir::Value":$env,
@@ -1984,17 +2019,17 @@
       "::mlir::Value":$dnmatC), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
+    return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies,
                  env, modeA, modeB, spmatA, dnmatB, dnmatC);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs)
   }];
 }
 
-def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> {
+def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
   let summary = "SpMM operation";
   let description = [{
     The `gpu.spmm` operation performs the SpMM operation on the given sparse and
@@ -2024,7 +2059,7 @@
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC,
-                       AnyMemRef:$buffer);
+                       Variadic<AnyMemRef>:$buffers);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let builders = [OpBuilder<(ins
@@ -2034,16 +2069,16 @@
       "::mlir::Value":$spmatA,
       "::mlir::Value":$dnmatB,
       "::mlir::Value":$dnmatC,
-      "::mlir::Value":$buffer), [{
+      "::mlir::ValueRange":$buffers), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
     return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
-                 modeB, spmatA, dnmatB, dnmatC, buffer);}]>
+                 modeB, spmatA, dnmatB, dnmatC, buffers);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers)
   }];
 }
 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -230,6 +230,14 @@
       {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
        llvmInt32Type, llvmPointerType /* void *stream */}};
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+  FunctionCallBuilder create2To4SpMatCallBuilder = {
+      "mgpuCusparseLtCreate2To4SpMat",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmPointerType, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
+#endif
   FunctionCallBuilder destroySpMatCallBuilder = {
       "mgpuDestroySpMat",
       llvmVoidType,
@@ -559,6 +567,20 @@
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::Create2To4SpMatOp> {
+public:
+  ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::Create2To4SpMatOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::Create2To4SpMatOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 class ConvertDestroySpMatOpToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
 public:
@@ -688,6 +710,29 @@
   return builder.create<LLVM::CallOp>(loc, function, arguments);
 }
 
+static bool is2To4Sparsity(Value spMat) {
+  // TODO: DefaultValuedAttr
+  if (auto op = spMat.getDefiningOp<gpu::Create2To4SpMatOp>())
+    return true;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
+    return false;
+  llvm_unreachable("cannot find spmat def");
+}
+
+static const char *inferSpMMType(Operation *op) {
+  for (Operation *user : op->getUsers()) {
+    auto spmmOp = dyn_cast<gpu::SpMMOp>(user);
+    if (!spmmOp)
+      continue;
+    // If the sparse operand has 2:4 (50%) sparsity, lower to cusparseLt.
+    if (is2To4Sparsity(spmmOp.getSpmatA()))
+      return "cusparseLt";
+  }
+  return "cusparse";
+}
+
 // Returns whether all operands are of LLVM type.
 static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                      ConversionPatternRewriter &rewriter) {
@@ -1287,6 +1332,11 @@
       llvm::cast<MemRefType>(op.getMemref().getType()).getElementType();
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
+  // For now, we track the use of the handle and lower it to a cusparse or
+  // cusparseLt call accordingly. If both cusparse and cusparseLt are used in
+  // the same block, two separate environment creation ops are required for
+  // the lowering to be correct. In the future, we may support using a single
+  // handle for both cusparse and cusparseLt in the sparse tensor / GPU dialects.
   auto handle =
       createDnVecCallBuilder
           .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream})
@@ -1424,6 +1474,11 @@
   return success();
 }
 
+// TODO: implement the lowering to the cusparseLt wrappers; bail out for now.
+LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::Create2To4SpMatOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  return failure();
+}
+
 LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DestroySpMatOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1542,8 +1597,8 @@
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
   auto stream = adaptor.getAsyncDependencies().front();
-  Value pBuf =
-      MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
+  Value pBuf = MemRefDescriptor(adaptor.getBuffers().front())
+                   .allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
   spMMCallBuilder.create(loc, rewriter,
@@ -1615,6 +1670,7 @@
                ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
+               ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern,
                ConvertDestroySpMatOpToGpuRuntimeCallPattern,
                ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
                ConvertSpMVOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -200,15 +200,36 @@
     EXCLUDE_FROM_LIBMLIR
   )
   set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)
-  target_include_directories(mlir_cuda_runtime
-    PRIVATE
-    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-  )
-  target_link_libraries(mlir_cuda_runtime
-    PRIVATE
-    ${CUDA_RUNTIME_LIBRARY}
-    ${CUDA_CUSPARSE_LIBRARY}
-  )
+
+  # cusparseLt is required for 2:4 sparsity support.
+  # As of its pre-1.0 releases, we assume cusparseLt has been downloaded as an
+  # archive and extracted into a dedicated directory CUDA_CUSPARSELT_DIR,
+  # rather than installed by a package manager. This matches the NVIDIA
+  # examples.
+  if(DEFINED CUDA_CUSPARSELT_DIR)
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+      ${CUDA_CUSPARSELT_DIR}/include
+    )
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+      ${CUDA_CUSPARSELT_DIR}/lib64/libcusparseLt.so
+    )
+    add_definitions(-DMLIR_CUDA_CUSPARSELT_ENABLED=1)
+  else()
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+    )
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+    )
+    add_definitions(-DMLIR_CUDA_CUSPARSELT_ENABLED=0)
+  endif()
 endif()
 
 if(MLIR_ENABLE_ROCM_RUNNER)
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -19,6 +19,17 @@
 #include "cuda.h"
 #include "cusparse.h"
 
+// TODO: this is a compile-time (of the MLIR compiler) disablement. We may also
+// want a run-time (of the MLIR compiler) disablement or warning: cusparseLt
+// currently does not work on CUDA architectures below 8.0 and triggers a
+// runtime (of the CUDA program) error there. When lowering the GPU sparse
+// dialect to LLVM calls we should at least emit a warning, or disable the
+// cusparseLt path, when the target architecture is below 8.0 and the user
+// still requests cusparseLt.
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+#include "cusparseLt.h"
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED
+
 #ifdef _WIN32
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
@@ -438,3 +449,136 @@
                                       matB, betap, matC, dtp,
                                       CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
+
+///
+/// Wrapper methods for the cuSparseLt library.
+///
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+struct cusparseLtSpMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  void *values{nullptr};
+  // TODO: the following is associated with the SpMM operator rather than the
+  // sparse matrix. Create workspace buffers and pass them to the SpMM
+  // execution.
+  cusparseLtMatmulAlgSelection_t alg_sel;
+  cusparseLtMatmulPlan_t plan;
+};
+struct cusparseLtDnMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  void *values{nullptr};
+};
+struct cusparseLtWorkspaceSizes {
+  size_t workspace_size;
+  size_t compressed_size;
+  size_t compressed_buffer_size;
+};
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateSparseLtEnv(CUstream /*stream*/) {
+  auto handle = new cusparseLtHandle_t();
+  // Note that cuSparseLt still uses cusparseStatus_t.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(handle))
+  return reinterpret_cast<void *>(handle);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) {
+  auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle))
+  delete handle;
+}
+
+// TODO: pass the cusparseLt handle pointer; `handlePtr` is not plumbed in yet.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCuSparseLtDnMat(intptr_t rows, intptr_t cols, void *values,
+                          int32_t dw, CUstream /*stream*/) {
+  auto matWithData = new cusparseLtDnMatHandleAndData;
+  matWithData->values = values;
+  cudaDataType_t dtp = dataTp(dw);
+  // Assume row-major storage when deciding the leading dimension.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
+      handlePtr, &(matWithData->mat), rows, cols, /*ld=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW))
+  return reinterpret_cast<void *>(matWithData);
+}
+
+// This can be used to destroy both dense matrices and sparse matrices in
+// cusparseLt.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
+  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+  // Destroy the plan associated with the sparse matrix.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matAndData->plan)))
+  delete matAndData;
+}
+
+// TODO: pass the cusparseLt handle pointer; `handlePtr` is not plumbed in yet.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCusparseLtCreate2To4SpMat(intptr_t rows, intptr_t cols, void *values,
+                              int32_t dw, CUstream /*stream*/) {
+  auto matWithData = new cusparseLtSpMatHandleAndData;
+  matWithData->values = values;
+  cudaDataType_t dtp = dataTp_cusparseLt(dw);
+  // Assume row-major storage when deciding the leading dimension.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
+      handlePtr, &(matWithData->mat), rows, cols, /*ld=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW,
+      CUSPARSELT_SPARSITY_50_PERCENT))
+  return reinterpret_cast<void *>(matWithData);
+}
+
+// Several things are done in this stage: algorithm selection, planning, and
+// returning the workspace and compressed matrix data buffer sizes.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuCuSparseLtSpMMBufferSize(void *h, void *a, CUstream /*stream*/) {
+  // TODO: support more advanced settings, e.g., a sparse right-hand operand.
+  // For now we assume matA is the sparse operand.
+  auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto sizes = new cusparseLtWorkspaceSizes;
+
+  // TODO: the matmul descriptor still needs to be initialized from the
+  // operand descriptors (cusparseLtMatmulDescriptorInit) before these calls.
+  cusparseLtMatmulDescriptor_t matmul;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
+      handle, &(matA->alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
+  int alg = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
+      handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
+      sizeof(alg)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(handle, &(matA->plan),
+                                                    &matmul, &(matA->alg_sel)))
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
+      handle, &(matA->plan), &(sizes->workspace_size)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
+      handle, &(matA->plan), &(sizes->compressed_size),
+      &(sizes->compressed_buffer_size)))
+  // Avoid zero-size allocations.
+  sizes->workspace_size =
+      (sizes->workspace_size == 0 ? 1 : sizes->workspace_size);
+  sizes->compressed_size =
+      (sizes->compressed_size == 0 ? 1 : sizes->compressed_size);
+  sizes->compressed_buffer_size =
+      (sizes->compressed_buffer_size == 0 ? 1 : sizes->compressed_buffer_size);
+  // TODO: operator-specific state such as the plan needs to be passed along
+  // to the SpMM execution as well.
+  return reinterpret_cast<intptr_t>(sizes);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuCuSparseLtSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                   int32_t dw, void *buf, CUstream stream) {
+  auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  ALPHABETA(dw, alpha, beta)
+
+  // TODO: the compressed matrix, the compression scratch buffer, and the
+  // workspace whose sizes are reported by mgpuCuSparseLtSpMMBufferSize still
+  // need to be allocated by the caller and passed in; nullptr placeholders
+  // keep this stub compiling until that plumbing lands.
+  void *dA_compressed = nullptr;
+  void *dA_compressedBuffer = nullptr;
+  void *d_workspace = nullptr;
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompress(
+      handle, &(matA->plan), matA->values, dA_compressed, dA_compressedBuffer,
+      stream))
+
+  // TODO: add support for multi-stream execution.
+  // Perform the matrix multiplication.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmul(
+      handle, &(matA->plan), alphap, dA_compressed, matB->values, betap,
+      matC->values, /*dD=*/matC->values, d_workspace, &stream, 1))
+}
+
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED
\ No newline at end of file
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -53,7 +53,7 @@
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
     %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref<?xf64>
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -341,7 +341,7 @@
     // CHECK: gpu.create_dn_mat async
     %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref<?xf64>
     // CHECK: gpu.spmm_buffer_size async
-    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat
+    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index
    // CHECK: gpu.spmm async
    %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
    // CHECK: gpu.sddmm_buffer_size async
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -49,7 +49,7 @@
// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?xf64>
// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?xf64>
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
// CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
// CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
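
Note for reviewers: below is a minimal usage sketch of how the new pieces are intended to compose once the cusparseLt lowering is implemented. Everything in it is illustrative: the SSA names, dimensions, and element types are made up, and the choice of exactly three buffer sizes / buffers (workspace, compressed matrix, compression scratch) mirrors the `cusparseLtWorkspaceSizes` struct in the runtime wrappers but is an assumption, not something this patch pins down; the same goes for the exact result-list syntax accepted by the custom parser.

```mlir
// Hypothetical async chain using the 2:4 path (not a test in this patch).
%env, %t0   = gpu.create_sparse_env async [%dep]
%spmat, %t1 = gpu.create_2to4_spmat async [%t0] %m, %k, %memA : memref<?xf16>
%dnB, %t2   = gpu.create_dn_mat async [%t1] %k, %n, %memB : memref<?xf16>
%dnC, %t3   = gpu.create_dn_mat async [%t2] %m, %n, %memC : memref<?xf16>
// With cusparseLt, the buffer-size query would report several sizes ...
%szs:3, %t4 = gpu.spmm_buffer_size async [%t3] %env, %spmat, %dnB, %dnC : index, index, index
%buf0, %t5  = gpu.alloc async [%t4] (%szs#0) : memref<?xi8>
%buf1, %t6  = gpu.alloc async [%t5] (%szs#1) : memref<?xi8>
%buf2, %t7  = gpu.alloc async [%t6] (%szs#2) : memref<?xi8>
// ... and gpu.spmm consumes the matching variadic buffer list.
%t8 = gpu.spmm async [%t7] %env, %spmat, %dnB, %dnC, %buf0, %buf1, %buf2
        : memref<?xi8>, memref<?xi8>, memref<?xi8>
%t9 = gpu.destroy_sp_mat async [%t8] %spmat
```

The plain cusparse path stays as before except for the trailing types (`: index` on gpu.spmm_buffer_size and the buffer type list on gpu.spmm), as the updated tests above show.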