Diff 546245

mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h

Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	PassOptions::Option<mlir::SparseParallelizationStrategy> parallelization{
"the outer loop."),		"the outer loop."),
clEnumValN(mlir::SparseParallelizationStrategy::kDenseAnyLoop,		clEnumValN(mlir::SparseParallelizationStrategy::kDenseAnyLoop,
"dense-any-loop",		"dense-any-loop",
"Enable dense parallelization for any loop."),		"Enable dense parallelization for any loop."),
clEnumValN(		clEnumValN(
mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,		mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",		"any-storage-any-loop",
"Enable sparse parallelization for any storage and loop."))};		"Enable sparse parallelization for any storage and loop."))};
		PassOptions::Option<mlir::GPUDataTransferStrategy> gpuDataTransfer{
		aartbik [unsubmitted, done]: can we make this a bit more precise, e.g. transfer -> gpuDataTransfer?
		*this, "gpu-data-transfer-strategy",
		::llvm::cl::desc(
		"Set the data transfer strategy between the host and the GPUs"),
		::llvm::cl::init(mlir::GPUDataTransferStrategy::kRegularDMA),
		llvm::cl::values(
		clEnumValN(mlir::GPUDataTransferStrategy::kRegularDMA, "regular-dma",
		"Default option: malloc on host without additional "
		"options or care and then use DMA to copy the data"),
		clEnumValN(mlir::GPUDataTransferStrategy::kPinnedDMA, "pinned-dma",
		"Based on the default option, pin the host memory to "
		"accelerate the data transfer"),
		clEnumValN(mlir::GPUDataTransferStrategy::kZeroCopy, "zero-copy",
		"Use zero-copy to perform the data transfer from the host "
		"to the GPU"))};

PassOptions::Option<bool> enableIndexReduction{		PassOptions::Option<bool> enableIndexReduction{
*this, "enable-index-reduction",		*this, "enable-index-reduction",
desc("Enable dependent index reduction based algorithm to handle "		desc("Enable dependent index reduction based algorithm to handle "
"non-trivial index expressions on sparse inputs (experimental "		"non-trivial index expressions on sparse inputs (experimental "
"features)"),		"features)"),
init(false)};		init(false)};

▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	struct SparseCompilerOptions
/// This option is used to enable GPU library generation.		/// This option is used to enable GPU library generation.
PassOptions::Option<bool> enableGPULibgen{		PassOptions::Option<bool> enableGPULibgen{
*this, "enable-gpu-libgen",		*this, "enable-gpu-libgen",
desc("Enables GPU acceleration by means of direct library calls (like "		desc("Enables GPU acceleration by means of direct library calls (like "
"cuSPARSE)")};		"cuSPARSE)")};

/// Projects out the options for `createSparsificationPass`.		/// Projects out the options for `createSparsificationPass`.
SparsificationOptions sparsificationOptions() const {		SparsificationOptions sparsificationOptions() const {
return SparsificationOptions(parallelization, enableIndexReduction,		return SparsificationOptions(parallelization, gpuDataTransfer,
enableGPULibgen, enableRuntimeLibrary);		enableIndexReduction, enableGPULibgen,
		enableRuntimeLibrary);
}		}

/// Projects out the options for `createSparseTensorConversionPass`.		/// Projects out the options for `createSparseTensorConversionPass`.
SparseTensorConversionOptions sparseTensorConversionOptions() const {		SparseTensorConversionOptions sparseTensorConversionOptions() const {
return SparseTensorConversionOptions(		return SparseTensorConversionOptions(
sparseToSparseConversionStrategy(sparseToSparse));		sparseToSparseConversionStrategy(sparseToSparse));
}		}

Show All 32 Lines

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h

Show All 38 Lines	enum class SparseParallelizationStrategy {
kNone,		kNone,
kDenseOuterLoop,		kDenseOuterLoop,
kAnyStorageOuterLoop,		kAnyStorageOuterLoop,
kDenseAnyLoop,		kDenseAnyLoop,
kAnyStorageAnyLoop		kAnyStorageAnyLoop
// TODO: support reduction parallelization too?		// TODO: support reduction parallelization too?
};		};

		// TODO : Zero copy is disabled due to correctness bugs.Tracker #64316
		aartbik [unsubmitted, done]: add a TODO that ZeroCopy is not supported yet, together with a tracker # that describes the problem so it can be fixed in the future (and then enabled here)
		enum class GPUDataTransferStrategy { kRegularDMA, kZeroCopy, kPinnedDMA };
		aartbik [unsubmitted, done]: GPUDataTransferStrategy (not necessarily Sparse)

#define GEN_PASS_DECL		#define GEN_PASS_DECL
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"		#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"

/// Options for the Sparsification pass.		/// Options for the Sparsification pass.
struct SparsificationOptions {		struct SparsificationOptions {
SparsificationOptions(SparseParallelizationStrategy p, bool idxReduc,		SparsificationOptions(SparseParallelizationStrategy p,
		GPUDataTransferStrategy t, bool idxReduc,
bool gpuLibgen, bool enableRT)		bool gpuLibgen, bool enableRT)
: parallelizationStrategy(p), enableIndexReduction(idxReduc),		: parallelizationStrategy(p), gpuDataTransferStrategy(t),
enableGPULibgen(gpuLibgen), enableRuntimeLibrary(enableRT) {}		enableIndexReduction(idxReduc), enableGPULibgen(gpuLibgen),
		enableRuntimeLibrary(enableRT) {}
		aartbik [unsubmitted, done]: let's name this gpuDataTransferStrategy
SparsificationOptions()		SparsificationOptions()
: SparsificationOptions(SparseParallelizationStrategy::kNone, false,		: SparsificationOptions(SparseParallelizationStrategy::kNone,
		GPUDataTransferStrategy::kRegularDMA, false,
false, true) {}		false, true) {}
SparseParallelizationStrategy parallelizationStrategy;		SparseParallelizationStrategy parallelizationStrategy;
		GPUDataTransferStrategy gpuDataTransferStrategy;
bool enableIndexReduction;		bool enableIndexReduction;
bool enableGPULibgen;		bool enableGPULibgen;
bool enableRuntimeLibrary;		bool enableRuntimeLibrary;
};		};

/// Sets up sparsification rewriting rules with the given options.		/// Sets up sparsification rewriting rules with the given options.
void populateSparsificationPatterns(		void populateSparsificationPatterns(
RewritePatternSet &patterns,		RewritePatternSet &patterns,
▲ Show 20 Lines • Show All 138 Lines • ▼ Show 20 Lines
std::unique_ptr<Pass> createSparseVectorizationPass();		std::unique_ptr<Pass> createSparseVectorizationPass();
std::unique_ptr<Pass> createSparseVectorizationPass(unsigned vectorLength,		std::unique_ptr<Pass> createSparseVectorizationPass(unsigned vectorLength,
bool enableVLAVectorization,		bool enableVLAVectorization,
bool enableSIMDIndex32);		bool enableSIMDIndex32);

void populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,		void populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
unsigned numThreads);		unsigned numThreads);

void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,		void populateSparseGPULibgenPatterns(RewritePatternSet &patterns, bool enableRT,
bool enableRT);		GPUDataTransferStrategy gpuDataTransfer);

std::unique_ptr<Pass> createSparseGPUCodegenPass();		std::unique_ptr<Pass> createSparseGPUCodegenPass();
std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);		std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Registration.		// Registration.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// Generate the code for registering passes.		/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION		#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"		#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"

} // namespace mlir		} // namespace mlir

#endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_PASSES_H_		#endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_PASSES_H_

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td

Show First 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	Option<"parallelization", "parallelization-strategy", "mlir::SparseParallelizationStrategy",
"any-storage-outer-loop",		"any-storage-outer-loop",
"Enable sparse parallelization regardless of storage for the outer loop."),		"Enable sparse parallelization regardless of storage for the outer loop."),
clEnumValN(mlir::SparseParallelizationStrategy::kDenseAnyLoop,		clEnumValN(mlir::SparseParallelizationStrategy::kDenseAnyLoop,
"dense-any-loop",		"dense-any-loop",
"Enable dense parallelization for any loop."),		"Enable dense parallelization for any loop."),
clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,		clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",		"any-storage-any-loop",
"Enable sparse parallelization for any storage and loop."))}]>,		"Enable sparse parallelization for any storage and loop."))}]>,
		Option<"gpuDataTransfer", "gpu-data-transfer-strategy", "mlir::GPUDataTransferStrategy",
		"mlir::GPUDataTransferStrategy::kRegularDMA",
		"Set the data transfer strategy", [{llvm::cl::values(
		clEnumValN(mlir::GPUDataTransferStrategy::kRegularDMA,
		"regular-dma",
		"Default option: malloc on host without additional "
		"options or care and then use DMA to copy the data"),
		clEnumValN(mlir::GPUDataTransferStrategy::kPinnedDMA, "pinned-dma",
		"Based on the default option, pin the host memory to "
		"accelerate the data transfer"),
		clEnumValN(mlir::GPUDataTransferStrategy::kZeroCopy, "zero-copy",
		"Use zero-copy to perform the data transfer from the host "
		"to the GPU"))}]>,
Option<"enableGPULibgen", "enable-gpu-libgen", "bool",		Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
"false",		"false",
"Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,		"Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",		Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
"true", "Enable runtime library for manipulating sparse tensors">,		"true", "Enable runtime library for manipulating sparse tensors">,
];		];
}		}


		K-Wu [author, unsubmitted, done]: TODO: unnecessary change
def PostSparsificationRewrite : Pass<"post-sparsification-rewrite", "ModuleOp"> {		def PostSparsificationRewrite : Pass<"post-sparsification-rewrite", "ModuleOp"> {
let summary = "Applies sparse tensor rewriting rules after sparsification";		let summary = "Applies sparse tensor rewriting rules after sparsification";
let description = [{		let description = [{
A pass that applies rewriting rules to sparse tensor operations after		A pass that applies rewriting rules to sparse tensor operations after
running the actual sparsification pass.		running the actual sparsification pass.
}];		}];
let constructor = "mlir::createPostSparsificationRewritePass()";		let constructor = "mlir::createPostSparsificationRewritePass()";
let dependentDialects = [		let dependentDialects = [
▲ Show 20 Lines • Show All 243 Lines • Show Last 20 Lines

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Show First 20 Lines • Show All 455 Lines • ▼ Show 20 Lines
#endif		#endif
}		}
assert(colA);		assert(colA);
return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,		return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,
sz2, nseA, rowA, colA, valA);		sz2, nseA, rowA, colA, valA);
}		}

/// Match and rewrite SpMV kernel.		/// Match and rewrite SpMV kernel.
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,		static LogicalResult
linalg::GenericOp op, bool enableRT) {		rewriteSpMV(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
		GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value a = op.getOperand(0);		Value a = op.getOperand(0);
Value x = op.getOperand(1);		Value x = op.getOperand(1);
Value y = op.getOperand(2); // we have y = Ax		Value y = op.getOperand(2); // we have y = Ax
SmallVector<Value> tokens;		SmallVector<Value> tokens;

		bool isZeroCopy =
		gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;

// Only admissible sparse matrix format and dense vectors.		// Only admissible sparse matrix format and dense vectors.
bool isCOO = false;		bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);		SparseTensorType aTp = getSparseTensorType(a);
SparseTensorType xTp = getSparseTensorType(x);		SparseTensorType xTp = getSparseTensorType(x);
SparseTensorType yTp = getSparseTensorType(y);		SparseTensorType yTp = getSparseTensorType(y);
if (!areAdmissibleTypes(aTp, xTp, yTp, enableRT, /*isMatVec=*/true, isCOO))		if (!areAdmissibleTypes(aTp, xTp, yTp, enableRT, /*isMatVec=*/true, isCOO))
return failure();		return failure();

// Start sparse kernel and copy data from host to device.		// Start sparse kernel and copy data from host to device.
// a : memR/memC/memV -> rowA,colA,valA		// a : memR/memC/memV -> rowA,colA,valA
// x : memX -> vecX		// x : memX -> vecX
// y : memY -> vecY		// y : memY -> vecY
Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);		Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);		Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);		Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);		Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);		Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, a);		Value memV = genToValues(rewriter, loc, a);
		Value memX, memY;
		aartbik [unsubmitted, done]: this change changes the order in all our tests; can't we keep the original order, and do the !regularDMA part later? (a bit hard to judge, so this is a soft request; okay to keep if needed)
		Value castR, castC, castV, castX, castY;
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		aartbik [unsubmitted, done]: keep the block together (unnecessary change), or start a new block with a comment if you want to have whitespace (but given the content above, that makes less sense)
		memX = genTensorToMemref(rewriter, loc, x);
		aartbik [unsubmitted, done]: to avoid the underscore and keep naming consistent, can't we just use castR, castC, castV, castX, castY?
		memY = genTensorToMemref(rewriter, loc, y);
		castR = genHostRegisterMemref(rewriter, loc, memR);
		if (memC)
		castC = genHostRegisterMemref(rewriter, loc, memC);
		castV = genHostRegisterMemref(rewriter, loc, memV);
		castX = genHostRegisterMemref(rewriter, loc, memX);
		castY = genHostRegisterMemref(rewriter, loc, memY);
		}

Value rowA = genAllocCopy(rewriter, loc, memR, tokens);		Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();		Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valA = genAllocCopy(rewriter, loc, memV, tokens);		Value valA = genAllocCopy(rewriter, loc, memV, tokens);
Value memX = genTensorToMemref(rewriter, loc, x);		if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
Value vecX = genAllocCopy(rewriter, loc, memX, tokens);		memX = genTensorToMemref(rewriter, loc, x);
Value memY = genTensorToMemref(rewriter, loc, y);		Value vecX = isZeroCopy ? memX : genAllocCopy(rewriter, loc, memX, tokens);
		if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
		memY = genTensorToMemref(rewriter, loc, y);
Value vecY = genAllocCopy(rewriter, loc, memY, tokens);		Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
tokens.clear();		tokens.clear();

// Create sparse environment and sparse matrix/dense vector handles.		// Create sparse environment and sparse matrix/dense vector handles.
Type indexTp = rewriter.getIndexType();		Type indexTp = rewriter.getIndexType();
Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();		Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();		Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
Show All 32 Lines	rewriteSpMV(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,

// Copy data back to host and free all the resoures.		// Copy data back to host and free all the resoures.
token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)		token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)		token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)		token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
.getAsyncToken();		.getAsyncToken();
token = genDeallocMemRef(rewriter, loc, rowA, token);		token = genDeallocMemRef(rewriter, loc, rowA, token);
		aartbik [unsubmitted, done]: keep the block together (unnecessary change)
if (colA)		if (colA)
token = genDeallocMemRef(rewriter, loc, colA, token);		token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);		token = genDeallocMemRef(rewriter, loc, valA, token);
token = genDeallocMemRef(rewriter, loc, buffer, token);		token = genDeallocMemRef(rewriter, loc, buffer, token);
		if (!isZeroCopy)
token = genDeallocMemRef(rewriter, loc, vecX, token);		token = genDeallocMemRef(rewriter, loc, vecX, token);
token = genCopyMemRef(rewriter, loc, memY, vecY, token);		token = genCopyMemRef(rewriter, loc, memY, vecY, token);
token = genDeallocMemRef(rewriter, loc, vecY, token);		token = genDeallocMemRef(rewriter, loc, vecY, token);
tokens.push_back(token);		tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		aartbik [unsubmitted, done]: no blank line
		genHostUnregisterMemref(rewriter, loc, castR);
		if (memC)
		genHostUnregisterMemref(rewriter, loc, castC);
		genHostUnregisterMemref(rewriter, loc, castV);
		genHostUnregisterMemref(rewriter, loc, castX);
		genHostUnregisterMemref(rewriter, loc, castY);
		}
tokens.clear();		tokens.clear();

// Done.		// Done.
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);		rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);
return success();		return success();
}		}

/// Match and rewrite SpMM kernel.		/// Match and rewrite SpMM kernel.
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,		static LogicalResult
linalg::GenericOp op, bool enableRT) {		rewriteSpMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
		GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value a = op.getOperand(0);		Value a = op.getOperand(0);
Value b = op.getOperand(1);		Value b = op.getOperand(1);
Value c = op.getOperand(2); // we have C = AB		Value c = op.getOperand(2); // we have C = AB
SmallVector<Value> tokens;		SmallVector<Value> tokens;

		bool isZeroCopy =
		gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;

// Only admissible sparse matrix format and dense matrices.		// Only admissible sparse matrix format and dense matrices.
bool isCOO = false;		bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);		SparseTensorType aTp = getSparseTensorType(a);
SparseTensorType bTp = getSparseTensorType(b);		SparseTensorType bTp = getSparseTensorType(b);
SparseTensorType cTp = getSparseTensorType(c);		SparseTensorType cTp = getSparseTensorType(c);
if (!areAdmissibleTypes(aTp, bTp, cTp, enableRT, /*isMatVec=*/false, isCOO))		if (!areAdmissibleTypes(aTp, bTp, cTp, enableRT, /*isMatVec=*/false, isCOO))
return failure();		return failure();

// Start sparse kernel and copy data from host to device.		// Start sparse kernel and copy data from host to device.
// a : memR/memC/memV -> rowA,colA,valA		// a : memR/memC/memV -> rowA,colA,valA
// b : bufB -> matA		// b : bufB -> matA
// c : bufC -> matC		// c : bufC -> matC
Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);		Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);		Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);		Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);		Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);		Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);		Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, a);		Value memV = genToValues(rewriter, loc, a);
		Value bufB, bufC;
		Value castR, castC, castV, castB, castBufC;
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		aartbik [unsubmitted, done]: same, just castR, castC, etc.
		bufB = genTensorToMemref(rewriter, loc, b);
		bufC = genTensorToMemref(rewriter, loc, c);
		castR = genHostRegisterMemref(rewriter, loc, memR);
		if (memC)
		castC = genHostRegisterMemref(rewriter, loc, memC);
		castV = genHostRegisterMemref(rewriter, loc, memV);
		castB = genHostRegisterMemref(rewriter, loc, bufB);
		castBufC = genHostRegisterMemref(rewriter, loc, bufC);
		}

Value rowA = genAllocCopy(rewriter, loc, memR, tokens);		Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();		Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valA = genAllocCopy(rewriter, loc, memV, tokens);		Value valA = genAllocCopy(rewriter, loc, memV, tokens);
Value bufB = genTensorToMemref(rewriter, loc, b);		if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
Value matB = genAllocCopy(rewriter, loc, bufB, tokens);		bufB = genTensorToMemref(rewriter, loc, b);
Value bufC = genTensorToMemref(rewriter, loc, c);		Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
		if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
		bufC = genTensorToMemref(rewriter, loc, c);
Value matC = genAllocCopy(rewriter, loc, bufC, tokens);		Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
tokens.clear();		tokens.clear();

// Create sparse environment and sparse matrix/dense matrix handles.		// Create sparse environment and sparse matrix/dense matrix handles.
Type indexTp = rewriter.getIndexType();		Type indexTp = rewriter.getIndexType();
Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();		Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();		Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)		token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
.getAsyncToken();		.getAsyncToken();
token = genDeallocMemRef(rewriter, loc, rowA, token);		token = genDeallocMemRef(rewriter, loc, rowA, token);
if (colA)		if (colA)
token = genDeallocMemRef(rewriter, loc, colA, token);		token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);		token = genDeallocMemRef(rewriter, loc, valA, token);
token = genDeallocMemRef(rewriter, loc, buffer, token);		token = genDeallocMemRef(rewriter, loc, buffer, token);
		if (!isZeroCopy)
token = genDeallocMemRef(rewriter, loc, matB, token);		token = genDeallocMemRef(rewriter, loc, matB, token);
token = genCopyMemRef(rewriter, loc, bufC, matC, token);		token = genCopyMemRef(rewriter, loc, bufC, matC, token);
token = genDeallocMemRef(rewriter, loc, matC, token);		token = genDeallocMemRef(rewriter, loc, matC, token);
tokens.push_back(token);		tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		genHostUnregisterMemref(rewriter, loc, castR);
		if (memC)
		genHostUnregisterMemref(rewriter, loc, castC);
		genHostUnregisterMemref(rewriter, loc, castV);
		genHostUnregisterMemref(rewriter, loc, castB);
		genHostUnregisterMemref(rewriter, loc, castC);
		}
tokens.clear();		tokens.clear();

// Done.		// Done.
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);		rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
return success();		return success();
}		}

// Match and rewrite 2:4 SpMM kernels.		// Match and rewrite 2:4 SpMM kernels.
static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,		static LogicalResult
linalg::GenericOp op) {		rewrite2To4SpMM(PatternRewriter &rewriter, linalg::GenericOp op,
		GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value A = op.getOperand(0);		Value A = op.getOperand(0);
Value B = op.getOperand(1);		Value B = op.getOperand(1);
Value C = op.getOperand(2); // we have C = AB		Value C = op.getOperand(2); // we have C = AB
SmallVector<Value> tokens;		SmallVector<Value> tokens;

		bool isZeroCopy =
		gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;

// All input should be dense tensors.		// All input should be dense tensors.
if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))		if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
return failure();		return failure();

		Value matA, matB;
Value bufA = genTensorToMemref(rewriter, loc, A);		Value bufA = genTensorToMemref(rewriter, loc, A);
Value matA = genAllocCopy(rewriter, loc, bufA, tokens);		if (!isZeroCopy)
		matA = genAllocCopy(rewriter, loc, bufA, tokens);
Value bufB = genTensorToMemref(rewriter, loc, B);		Value bufB = genTensorToMemref(rewriter, loc, B);
Value matB = genAllocCopy(rewriter, loc, bufB, tokens);		if (!isZeroCopy)
		matB = genAllocCopy(rewriter, loc, bufB, tokens);
Value bufC = genTensorToMemref(rewriter, loc, C);		Value bufC = genTensorToMemref(rewriter, loc, C);
		Value castA, castB, castC;
		aartbik [unsubmitted, done]: no blank line
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		aartbik [unsubmitted, done]: castA, castB, etc.
		castA = genHostRegisterMemref(rewriter, loc, bufA);
		castB = genHostRegisterMemref(rewriter, loc, bufB);
		castC = genHostRegisterMemref(rewriter, loc, bufC);
		}

		if (isZeroCopy) {
		matA = bufA;
		matB = bufB;
		}
Value matC = genAllocCopy(rewriter, loc, bufC, tokens);		Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
tokens.clear();		tokens.clear();
Value szm = linalg::createOrFoldDimOp(rewriter, loc, matA, 0);		Value szm = linalg::createOrFoldDimOp(rewriter, loc, matA, 0);
Value szk = linalg::createOrFoldDimOp(rewriter, loc, matB, 0);		Value szk = linalg::createOrFoldDimOp(rewriter, loc, matB, 0);
Value szn = linalg::createOrFoldDimOp(rewriter, loc, matC, 1);		Value szn = linalg::createOrFoldDimOp(rewriter, loc, matC, 1);

Type indexTp = rewriter.getIndexType();		Type indexTp = rewriter.getIndexType();
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)		token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
.getAsyncToken();		.getAsyncToken();
SmallVector<Value> newDynamicSizes;		SmallVector<Value> newDynamicSizes;

token = genDeallocMemRef(rewriter, loc, buffer, token);		token = genDeallocMemRef(rewriter, loc, buffer, token);
token = genDeallocMemRef(rewriter, loc, buffer2, token);		token = genDeallocMemRef(rewriter, loc, buffer2, token);
token = genDeallocMemRef(rewriter, loc, buffer3, token);		token = genDeallocMemRef(rewriter, loc, buffer3, token);

		if (!isZeroCopy)
token = genDeallocMemRef(rewriter, loc, matA, token);		token = genDeallocMemRef(rewriter, loc, matA, token);
		if (!isZeroCopy)
token = genDeallocMemRef(rewriter, loc, matB, token);		token = genDeallocMemRef(rewriter, loc, matB, token);
token = genCopyMemRef(rewriter, loc, bufC, matC, token);		token = genCopyMemRef(rewriter, loc, bufC, matC, token);
token = genDeallocMemRef(rewriter, loc, matC, token);		token = genDeallocMemRef(rewriter, loc, matC, token);
tokens.push_back(token);		tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		genHostUnregisterMemref(rewriter, loc, castA);
		genHostUnregisterMemref(rewriter, loc, castB);
		genHostUnregisterMemref(rewriter, loc, castC);
		}
tokens.clear();		tokens.clear();
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);		rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
return success();		return success();
}		}

/// Match and rewrite SDDMM kernel.		/// Match and rewrite SDDMM kernel.
static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,		static LogicalResult
linalg::GenericOp op, bool enableRT) {		rewriteSDDMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
		GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value a = op.getOperand(0);		Value a = op.getOperand(0);
Value b = op.getOperand(1);		Value b = op.getOperand(1);
Value c = op.getOperand(2);		Value c = op.getOperand(2);
SmallVector<Value> tokens;		SmallVector<Value> tokens;

		bool isZeroCopy =
		gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;

// Only admissible sparse matrix format and dense matrices, no COO.		// Only admissible sparse matrix format and dense matrices, no COO.
bool isCOO = false;		bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);		SparseTensorType aTp = getSparseTensorType(a);
SparseTensorType bTp = getSparseTensorType(b);		SparseTensorType bTp = getSparseTensorType(b);
SparseTensorType cTp = getSparseTensorType(c);		SparseTensorType cTp = getSparseTensorType(c);
if (!areAdmissibleTypes(cTp, bTp, aTp, enableRT, false, isCOO))		if (!areAdmissibleTypes(cTp, bTp, aTp, enableRT, false, isCOO))
return failure();		return failure();
if (isCOO)		if (isCOO)
return failure();		return failure();

// The SDDMM does the in-place operation.		// The SDDMM does the in-place operation.
// Start sparse kernel and copy data from host to device.		// Start sparse kernel and copy data from host to device.
// a : bufA -> matA		// a : bufA -> matA
// b : bufB -> matB		// b : bufB -> matB
// c : memR/memC/memV -> rowC,colC,valC		// c : memR/memC/memV -> rowC,colC,valC
Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);		Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);
Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);		Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);		Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);		Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
		Value matA, matB;
Value bufA = genTensorToMemref(rewriter, loc, a);		Value bufA = genTensorToMemref(rewriter, loc, a);
Value matA = genAllocCopy(rewriter, loc, bufA, tokens);		if (!isZeroCopy)
		matA = genAllocCopy(rewriter, loc, bufA, tokens);
Value bufB = genTensorToMemref(rewriter, loc, b);		Value bufB = genTensorToMemref(rewriter, loc, b);
Value matB = genAllocCopy(rewriter, loc, bufB, tokens);		if (!isZeroCopy)
		matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);		Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);		Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, c);		Value memV = genToValues(rewriter, loc, c);

		Value castB, castA, castR, castC, castV;
		aartbik (inline comment; Unsubmitted, Done): castB, castA
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		castB = genHostRegisterMemref(rewriter, loc, bufB);
		castA = genHostRegisterMemref(rewriter, loc, bufA);
		castR = genHostRegisterMemref(rewriter, loc, memR);
		if (memC)
		castC = genHostRegisterMemref(rewriter, loc, memC);
		castV = genHostRegisterMemref(rewriter, loc, memV);
		}

		if (isZeroCopy) {
		matA = bufA;
		matB = bufB;
		}
Value rowC = genAllocCopy(rewriter, loc, memR, tokens);		Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();		Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valC = genAllocCopy(rewriter, loc, memV, tokens);		Value valC = genAllocCopy(rewriter, loc, memV, tokens);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
tokens.clear();		tokens.clear();

// Create sparse environment and sparse matrix/dense matrix handles.		// Create sparse environment and sparse matrix/dense matrix handles.
Type indexTp = rewriter.getIndexType();		Type indexTp = rewriter.getIndexType();
Show All 34 Lines	rewriteSDDMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
// Copy data back to host and free all the resources.		// Copy data back to host and free all the resources.
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnA)		token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnA)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)		token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)		token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
.getAsyncToken();		.getAsyncToken();
token = genDeallocMemRef(rewriter, loc, buffer, token);		token = genDeallocMemRef(rewriter, loc, buffer, token);
		if (!isZeroCopy) {
token = genDeallocMemRef(rewriter, loc, matA, token);		token = genDeallocMemRef(rewriter, loc, matA, token);
token = genDeallocMemRef(rewriter, loc, matB, token);		token = genDeallocMemRef(rewriter, loc, matB, token);
		}
token = genDeallocMemRef(rewriter, loc, rowC, token);		token = genDeallocMemRef(rewriter, loc, rowC, token);
if (colC)		if (colC)
token = genDeallocMemRef(rewriter, loc, colC, token);		token = genDeallocMemRef(rewriter, loc, colC, token);
token = genCopyMemRef(rewriter, loc, memV, valC, token);		token = genCopyMemRef(rewriter, loc, memV, valC, token);
token = genDeallocMemRef(rewriter, loc, valC, token);		token = genDeallocMemRef(rewriter, loc, valC, token);
tokens.push_back(token);		tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
		if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
		genHostUnregisterMemref(rewriter, loc, castB);
		genHostUnregisterMemref(rewriter, loc, castA);
		genHostUnregisterMemref(rewriter, loc, castR);
		if (memC)
		genHostUnregisterMemref(rewriter, loc, castC);
		genHostUnregisterMemref(rewriter, loc, castV);
		}
tokens.clear();		tokens.clear();

// Done.		// Done.
rewriter.replaceOpWithNewOp<sparse_tensor::LoadOp>(op, c);		rewriter.replaceOpWithNewOp<sparse_tensor::LoadOp>(op, c);
return success();		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines
// Rewriting rules for library recognition and code generation.		// Rewriting rules for library recognition and code generation.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// Proof-of-concept rewriter. This rule recognizes certain math kernels		/// Proof-of-concept rewriter. This rule recognizes certain math kernels
/// and replaces these with corresponding calls into a sparse library.		/// and replaces these with corresponding calls into a sparse library.
struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {		struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;		using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;

LinalgOpRewriter(MLIRContext *context, bool rt)		LinalgOpRewriter(MLIRContext *context, bool rt, GPUDataTransferStrategy t)
: OpRewritePattern(context), enableRT(rt) {}		: OpRewritePattern(context), enableRT(rt), gpuDataTransferStrategy(t) {}

LogicalResult matchAndRewrite(linalg::GenericOp op,		LogicalResult matchAndRewrite(linalg::GenericOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
if (op.getNumDpsInits() != 1)		if (op.getNumDpsInits() != 1)
return failure(); // reject multi-output		return failure(); // reject multi-output

const unsigned numLoops = op.getNumLoops();		const unsigned numLoops = op.getNumLoops();
const unsigned numTensors = op->getNumOperands();		const unsigned numTensors = op->getNumOperands();
Show All 9 Lines	LogicalResult matchAndRewrite(linalg::GenericOp op,
// TODO: identify alpha and beta and pass them to the CUDA calls		// TODO: identify alpha and beta and pass them to the CUDA calls

// Recognize a SpMV kernel.		// Recognize a SpMV kernel.
if (numLoops == 2 && numTensors == 3 &&		if (numLoops == 2 && numTensors == 3 &&
linalg::isParallelIterator(iteratorTypes[0]) &&		linalg::isParallelIterator(iteratorTypes[0]) &&
linalg::isReductionIterator(iteratorTypes[1]) &&		linalg::isReductionIterator(iteratorTypes[1]) &&
// TODO: add transposed {i, j}		// TODO: add transposed {i, j}
maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {		maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
return rewriteSpMV(rewriter, op, enableRT);		return rewriteSpMV(rewriter, op, enableRT, gpuDataTransferStrategy);
}		}

// Recognize a SpMM kernel.		// Recognize a SpMM kernel.
if (numLoops == 3 && numTensors == 3 &&		if (numLoops == 3 && numTensors == 3 &&
linalg::isParallelIterator(iteratorTypes[0]) &&		linalg::isParallelIterator(iteratorTypes[0]) &&
linalg::isParallelIterator(iteratorTypes[1]) &&		linalg::isParallelIterator(iteratorTypes[1]) &&
linalg::isReductionIterator(iteratorTypes[2]) &&		linalg::isReductionIterator(iteratorTypes[2]) &&
// TODO: add transposed {i, k}, {k, j}		// TODO: add transposed {i, k}, {k, j}
// TODO: maybe add transposed {i, j} in future		// TODO: maybe add transposed {i, j} in future
maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {		maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
if (op->getAttr("DENSE24"))		if (op->getAttr("DENSE24"))
return rewrite2To4SpMM(rewriter, op);		return rewrite2To4SpMM(rewriter, op, gpuDataTransferStrategy);

return rewriteSpMM(rewriter, op, enableRT);		return rewriteSpMM(rewriter, op, enableRT, gpuDataTransferStrategy);
}		}

// Recognize a SDDMM kernel.		// Recognize a SDDMM kernel.
if (numLoops == 3 && numTensors == 3 &&		if (numLoops == 3 && numTensors == 3 &&
linalg::isParallelIterator(iteratorTypes[0]) &&		linalg::isParallelIterator(iteratorTypes[0]) &&
linalg::isParallelIterator(iteratorTypes[1]) &&		linalg::isParallelIterator(iteratorTypes[1]) &&
linalg::isReductionIterator(iteratorTypes[2]) &&		linalg::isReductionIterator(iteratorTypes[2]) &&
// TODO: add transposed {i, k}, {k, j}		// TODO: add transposed {i, k}, {k, j}
// TODO: maybe add transposed {i, j} in future		// TODO: maybe add transposed {i, j} in future
maps == infer({{i, k}, {k, j}, {i, j}}) &&		maps == infer({{i, k}, {k, j}, {i, j}}) &&
matchSumReductionOfMulUnary(op)) {		matchSumReductionOfMulUnary(op)) {
return rewriteSDDMM(rewriter, op, enableRT);		return rewriteSDDMM(rewriter, op, enableRT, gpuDataTransferStrategy);
}		}

return failure();		return failure();
}		}

private:		private:
bool enableRT;		bool enableRT;
		GPUDataTransferStrategy gpuDataTransferStrategy;
};		};

} // namespace		} // namespace

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Public method for populating GPU rewriting rules.		// Public method for populating GPU rewriting rules.
//		//
// Currently two sets of rewriting rules are made available. The first set		// Currently two sets of rewriting rules are made available. The first set
// implements direct code generation, currently by means of converting the		// implements direct code generation, currently by means of converting the
// outermost parallel loop into GPU threads. The second set implements		// outermost parallel loop into GPU threads. The second set implements
// library recognition of a set of sparse operations. Eventually, the right		// library recognition of a set of sparse operations. Eventually, the right
// combination of these two approaches has to be found.		// combination of these two approaches has to be found.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,		void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
unsigned numThreads) {		unsigned numThreads) {
patterns.add<ForallRewriter>(patterns.getContext(), numThreads);		patterns.add<ForallRewriter>(patterns.getContext(), numThreads);
}		}

void mlir::populateSparseGPULibgenPatterns(RewritePatternSet &patterns,		void mlir::populateSparseGPULibgenPatterns(
bool enableRT) {		RewritePatternSet &patterns, bool enableRT,
patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT);		GPUDataTransferStrategy gpuDataTransfer) {
		patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT,
		gpuDataTransfer);
}		}

mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp

	Show First 20 Lines • Show All 59 Lines • ▼ Show 20 Lines

	struct SparsificationPass			struct SparsificationPass
	: public impl::SparsificationPassBase<SparsificationPass> {			: public impl::SparsificationPassBase<SparsificationPass> {

	SparsificationPass() = default;			SparsificationPass() = default;
	SparsificationPass(const SparsificationPass &pass) = default;			SparsificationPass(const SparsificationPass &pass) = default;
	SparsificationPass(const SparsificationOptions &options) {			SparsificationPass(const SparsificationOptions &options) {
	parallelization = options.parallelizationStrategy;			parallelization = options.parallelizationStrategy;
				gpuDataTransfer = options.gpuDataTransferStrategy;
	enableIndexReduction = options.enableIndexReduction;			enableIndexReduction = options.enableIndexReduction;
	enableGPULibgen = options.enableGPULibgen;			enableGPULibgen = options.enableGPULibgen;
	enableRuntimeLibrary = options.enableRuntimeLibrary;			enableRuntimeLibrary = options.enableRuntimeLibrary;
	}			}

	void runOnOperation() override {			void runOnOperation() override {
	auto *ctx = &getContext();			auto *ctx = &getContext();
	// Translate strategy flags to strategy options.			// Translate strategy flags to strategy options.
	SparsificationOptions options(parallelization, enableIndexReduction,			SparsificationOptions options(parallelization, gpuDataTransfer,
	enableGPULibgen, enableRuntimeLibrary);			enableIndexReduction, enableGPULibgen,
				enableRuntimeLibrary);
	// Apply GPU libgen (if requested), sparsification, and cleanup rewriting.			// Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
	RewritePatternSet patterns(ctx);			RewritePatternSet patterns(ctx);
	if (enableGPULibgen) {			if (enableGPULibgen) {
	populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);			// TODO: Zero copy is disabled due to correctness bugs. Tracker #64316
				assert(gpuDataTransfer != GPUDataTransferStrategy::kZeroCopy &&
				"zero-copy transfer not supported with GPU libgen");
				populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary,
				gpuDataTransfer);
				aartbik (inline comment; Unsubmitted, Done): add tracker #
	}			}
	populateSparsificationPatterns(patterns, options);			populateSparsificationPatterns(patterns, options);
	scf::ForOp::getCanonicalizationPatterns(patterns, ctx);			scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
	(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));			(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
	}			}
	};			};

	struct PostSparsificationRewritePass			struct PostSparsificationRewritePass
	▲ Show 20 Lines • Show All 353 Lines • Show Last 20 Lines

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir

	//			//
	// NOTE: this test requires gpu-sm80 and cusparselt			// NOTE: this test requires gpu-sm80 and cusparselt
	//			//
	// RUN: mlir-opt %s \			// DEFINE: %{compile} = mlir-opt %s \
	// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
	// RUN: \| mlir-cpu-runner \			// DEFINE: %{run} = mlir-cpu-runner \
	// RUN: --shared-libs=%mlir_cuda_runtime \			// DEFINE: --shared-libs=%mlir_cuda_runtime \
	// RUN: --shared-libs=%mlir_c_runner_utils \			// DEFINE: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e main --entry-point-result=void \			// DEFINE: --e main --entry-point-result=void \
	// RUN: \| FileCheck %s			// DEFINE: \| FileCheck %s

				// RUN: %{compile}" \| %{run}
				// RUN: %{compile} gpu-data-transfer-strategy=pinned-dma" \| %{run}
				// Tracker #64316
				aartbik (inline comment; Unsubmitted, Done): add tracker #
				// RUNNOT: %{compile} gpu-data-transfer-strategy=zero-copy" \| %{run}

	#map = affine_map<(d0, d1, d2) -> (d0, d2)>			#map = affine_map<(d0, d1, d2) -> (d0, d2)>
	#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>			#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
	#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>			#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>

	module {			module {
	llvm.func @mgpuCreateSparseLtEnv()			llvm.func @mgpuCreateSparseLtEnv()
	llvm.func @mgpuDestroySparseLtEnv()			llvm.func @mgpuDestroySparseLtEnv()
	▲ Show 20 Lines • Show All 179 Lines • Show Last 20 Lines

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir

	//			//
	// NOTE: this test requires gpu-sm80 and cusparselt			// NOTE: this test requires gpu-sm80 and cusparselt
	//			//
	// RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \			// DEFINE: %{compile} = mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
	// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \			// DEFINE: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
	// RUN: %s \			// DEFINE: %s
	// RUN: \| mlir-cpu-runner \			// DEFINE: %{run} = mlir-cpu-runner \
	// RUN: --shared-libs=%mlir_cuda_runtime \			// DEFINE: --shared-libs=%mlir_cuda_runtime \
	// RUN: --shared-libs=%mlir_c_runner_utils \			// DEFINE: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e main --entry-point-result=void \			// DEFINE: --e main --entry-point-result=void \
	// RUN: \| FileCheck %s			// DEFINE: \| FileCheck %s

				// RUN: %{compile} \| %{run}
				aartbik (inline comment; Unsubmitted, Done): the other RUN and NOTRUN?
				// RUN: %{compile} --sparse-compiler="gpu-data-transfer-strategy=pinned-dma" \| %{run}
				// RUNNOT: %{compile} --sparse-compiler="gpu-data-transfer-strategy=zero-copy" \| %{run}


	module {			module {
	llvm.func @mgpuCreateSparseLtEnv()			llvm.func @mgpuCreateSparseLtEnv()
	llvm.func @mgpuDestroySparseLtEnv()			llvm.func @mgpuDestroySparseLtEnv()

	func.func @sampled_matmul(%a : memref<16x32xf16>,			func.func @sampled_matmul(%a : memref<16x32xf16>,
	%b : memref<32x16xf16>,			%b : memref<32x16xf16>,
	%c : memref<16x16xf16>) {			%c : memref<16x16xf16>) {
	▲ Show 20 Lines • Show All 197 Lines • Show Last 20 Lines

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir

	//			//
	// NOTE: this test requires gpu-sm80			// NOTE: this test requires gpu-sm80
	//			//
				// DEFINE: %{compile} = mlir-opt %s \
				// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
				// DEFINE: %{run} = mlir-cpu-runner \
				// DEFINE: --shared-libs=%mlir_cuda_runtime \
				// DEFINE: --shared-libs=%mlir_c_runner_utils \
				// DEFINE: --e main --entry-point-result=void \
				// DEFINE: \| FileCheck %s
				//
				//
	// with RT lib (SoA COO):			// with RT lib (SoA COO):
	//			//
	// RUN: mlir-opt %s \			// RUN: %{compile} enable-runtime-library=true" \| %{run}
	// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// RUN: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=pinned-dma" \| %{run}
	// RUN: \| mlir-cpu-runner \			// Tracker #64316
				aartbik (inline comment; Unsubmitted, Done): tracker
	// RUN: --shared-libs=%mlir_cuda_runtime \			// RUNNOT: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=zero-copy" \| %{run}
	// RUN: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e main --entry-point-result=void \
	// RUN: \| FileCheck %s
	//			//
	// without RT lib (AoS COO): note, may fall back to CPU			// without RT lib (AoS COO): note, may fall back to CPU
	//			//
	// RUN: mlir-opt %s \			// RUN: %{compile} enable-runtime-library=false" \| %{run}
	// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// RUN: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=pinned-dma" \| %{run}
	// RUN: \| mlir-cpu-runner \			// Tracker #64316
	// RUN: --shared-libs=%mlir_cuda_runtime \			// RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" \| %{run}
	// RUN: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e main --entry-point-result=void \
	// RUN: \| FileCheck %s

	#SortedCOO = #sparse_tensor.encoding<{			#SortedCOO = #sparse_tensor.encoding<{
	lvlTypes = [ "compressed-nu", "singleton" ]			lvlTypes = [ "compressed-nu", "singleton" ]
	}>			}>

	#CSR = #sparse_tensor.encoding<{			#CSR = #sparse_tensor.encoding<{
	lvlTypes = [ "dense", "compressed" ],			lvlTypes = [ "dense", "compressed" ],
	posWidth = 32,			posWidth = 32,
	▲ Show 20 Lines • Show All 154 Lines • Show Last 20 Lines

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir

	//			//
	// NOTE: this test requires gpu-sm80			// NOTE: this test requires gpu-sm80
	//			//
				// DEFINE: %{compile} = mlir-opt %s \
				// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
				// DEFINE: %{run} = mlir-cpu-runner \
				// DEFINE: --shared-libs=%mlir_cuda_runtime \
				// DEFINE: --shared-libs=%mlir_c_runner_utils \
				// DEFINE: --e main --entry-point-result=void \
				// DEFINE: \| FileCheck %s
				//
	// with RT lib (SoA COO):			// with RT lib (SoA COO):
	//			//
	// RUN: mlir-opt %s \			// RUN: %{compile} enable-runtime-library=true" \| %{run}
	// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// RUN: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=pinned-dma" \| %{run}
	// RUN: \| mlir-cpu-runner \			// Tracker #64316
				aartbik (inline comment; Unsubmitted, Done): tracker
	// RUN: --shared-libs=%mlir_cuda_runtime \			// RUNNOT: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=zero-copy" \| %{run}
	// RUN: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e main --entry-point-result=void \
	// RUN: \| FileCheck %s
	//			//
	// without RT lib (AoS COO): note, may fall back to CPU			// without RT lib (AoS COO): note, may fall back to CPU
	//			//
	// RUN: mlir-opt %s \			// RUN: %{compile} enable-runtime-library=false" \| %{run}
	// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// RUN: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=pinned-dma" \| %{run}
	// RUN: \| mlir-cpu-runner \			// Tracker #64316
	// RUN: --shared-libs=%mlir_cuda_runtime \			// RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" \| %{run}
	// RUN: --shared-libs=%mlir_c_runner_utils \			//
	// RUN: --e main --entry-point-result=void \
	// RUN: \| FileCheck %s

	#SortedCOO = #sparse_tensor.encoding<{			#SortedCOO = #sparse_tensor.encoding<{
	lvlTypes = [ "compressed-nu", "singleton" ]			lvlTypes = [ "compressed-nu", "singleton" ]
	}>			}>

	#CSR = #sparse_tensor.encoding<{			#CSR = #sparse_tensor.encoding<{
	lvlTypes = [ "dense", "compressed" ],			lvlTypes = [ "dense", "compressed" ],
	posWidth = 32,			posWidth = 32,
	▲ Show 20 Lines • Show All 103 Lines • Show Last 20 Lines

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir

	//			//
	// NOTE: this test requires gpu-sm80			// NOTE: this test requires gpu-sm80
	//			//
				// DEFINE: %{compile} = mlir-opt %s \
				// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
				// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
				// DEFINE: mlir-cpu-runner \
				// DEFINE: --shared-libs=%mlir_cuda_runtime \
				// DEFINE: --shared-libs=%mlir_c_runner_utils \
				// DEFINE: --e entry --entry-point-result=void \
				// DEFINE: \| FileCheck %s
				//
	// with RT lib:			// with RT lib:
	//			//
	// RUN: mlir-opt %s \			// RUN: %{compile} enable-runtime-library=true" \| %{run}
	// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// RUN: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=pinned-dma" \| %{run}
	// RUN: \| TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \			// Tracker #64316
				aartbik (inline comment; Unsubmitted, Done): tracker
	// RUN: mlir-cpu-runner \			// RUNNOT: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=zero-copy" \| %{run}
	// RUN: --shared-libs=%mlir_cuda_runtime \
	// RUN: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e entry --entry-point-result=void \
	// RUN: \| FileCheck %s
	//			//
	// without RT lib:			// without RT lib:
	//			//
	// RUN: mlir-opt %s \			// RUN: %{compile} enable-runtime-library=false" \| %{run}
	// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \			// RUN: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=pinned-dma" \| %{run}
	// RUN: \| TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \			// Tracker #64316
	// RUN: mlir-cpu-runner \			// RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" \| %{run}
	// RUN: --shared-libs=%mlir_cuda_runtime \
	// RUN: --shared-libs=%mlir_c_runner_utils \
	// RUN: --e entry --entry-point-result=void \
	// RUN: \| FileCheck %s
	//			//

	!Filename = !llvm.ptr<i8>			!Filename = !llvm.ptr<i8>

	#CSR = #sparse_tensor.encoding<{			#CSR = #sparse_tensor.encoding<{
	lvlTypes = ["dense", "compressed"]			lvlTypes = ["dense", "compressed"]
	}>			}>

	#trait_sampled_dense_dense = {			#trait_sampled_dense_dense = {
	▲ Show 20 Lines • Show All 126 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse][gpu] introduce flag that controls host to device copy strategies (regular dma default)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 546245

mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse][gpu] introduce flag that controls host to device copy strategies (regular dma default)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 546245

mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h

mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir

mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir

[mlir][sparse][gpu] introduce flag that controls host to device copy strategies (regular dma default)
ClosedPublic