This is an archive of the discontinued LLVM Phabricator instance.

[mlir][LinAlg][Transform][GPU] Add GPU memory hierarchy to the transform.promote op
ClosedPublic

Authored by tavakkoliamirmohammad on Feb 23 2023, 12:11 PM.

Details

Summary

In this patch we add support for copying a memref.subview to shared or private memory on GPUs. The global-to-shared memory copy is adapted from code in IREE (https://github.com/iree-org/iree), but the private memory copy part has not been implemented in IREE. This patch enables transferring a subview from global->shared, global->private, and shared->private (see the sketch after this summary).

Our final aim is to provide a copy layout as an affine map to the transform.promote op to support transposed memory copies. This map is a permutation of the original affine index map. Although this has been implemented and the user can copy data to an arbitrary layout, that attempt is not included in this patch, since we still have problems getting linalg.generic operations to change their index map to the transformed index map. You can find more in the following links (Initial attempt to support layout map in promote op in transform dialect) (Fix data transpose in shared memory)
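For context, here is a minimal C++ sketch of how such GPU copy callbacks plug into Linalg promotion. The helper names follow this patch's description; the exact callback signatures are assumed and may differ.

// Sketch only: wires the workgroup-memory helpers this patch introduces
// into Linalg promotion options.
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"

using namespace mlir;

static linalg::LinalgPromotionOptions makeWorkgroupPromotionOptions() {
  linalg::LinalgPromotionOptions options;
  // Allocate/deallocate the promoted buffer in workgroup (shared) memory.
  options.setAllocationDeallocationFns(linalg::allocateWorkgroupMemory,
                                       linalg::deallocateWorkgroupMemory);
  // Copy the subview into (and back out of) the promoted buffer.
  options.setCopyInOutFns(linalg::copyToWorkgroupMemory,
                          linalg::copyToWorkgroupMemory);
  return options;
}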

Diff Detail

Event Timeline

tavakkoliamirmohammad requested review of this revision.Feb 23 2023, 12:11 PM

Formatted the file and fixed the wrong diff, which did not include the private memory copy.

ftynse requested changes to this revision.Feb 23 2023, 2:31 PM
ftynse added inline comments.
mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
1784

Functions that are only necessary in this translation unit should be declared static.

Top-level entities such as functions must be documented.

I would also consider placing that somewhere in Linalg/Transform/ rather than in TransformOps.
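A hedged illustration of both conventions (the function name and exact signature here are hypothetical):

/// Allocates a buffer in GPU shared memory for the given subview.
/// Documented with /// and declared static because it is only used in
/// this translation unit.
static std::optional<Value>
allocateSharedMemoryBuffer(OpBuilder &builder, memref::SubViewOp subview,
                           ArrayRef<Value> sizeBounds, DataLayout &layout);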

1796

Nit: don't specify the number of stack elements for the vector.

1804

It's good practice to use OpBuilder::InsertionGuard to restore the insertion point before the function returns.
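For instance (a generic sketch, not code from the patch):

// OpBuilder::InsertionGuard is an RAII helper: it saves the insertion
// point on construction and restores it when the guard is destroyed.
static void buildInBlock(OpBuilder &builder, Block *block) {
  OpBuilder::InsertionGuard guard(builder);
  builder.setInsertionPointToStart(block);
  // ... create ops at the start of `block` ...
} // The previous insertion point is restored here.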

1821

We don't commit commented-out code.

1847

This function only differs from the one above by not using the shared address space here. This is something that can be generalized.

We also have an explicit address space for private memory on GPUs. Likely, it should be used here instead of the default address space.
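A sketch of the suggested generalization, assuming the GPU dialect's gpu::AddressSpaceAttr (the helper name is hypothetical):

// One allocation helper parameterized over the GPU address space instead
// of two near-identical copies for shared and private memory.
// Needs: mlir/Dialect/GPU/IR/GPUDialect.h, mlir/Dialect/MemRef/IR/MemRef.h
static Value allocateGPUBuffer(OpBuilder &builder, Location loc,
                               MemRefType type, gpu::AddressSpace space) {
  Attribute memorySpace =
      gpu::AddressSpaceAttr::get(builder.getContext(), space);
  auto allocType = MemRefType::get(type.getShape(), type.getElementType(),
                                   type.getLayout(), memorySpace);
  return builder.create<memref::AllocOp>(loc, allocType);
}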

1880–1894

This would do strange things if both attributes are set. Have you considered having a single attribute that specifies which kind of memory to use?

This revision now requires changes to proceed.Feb 23 2023, 2:31 PM
mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
1804

This is done in line 1788. Is anything else needed to reset the insertion point? When the OpBuilder::InsertionGuard is destroyed, the insertion point is restored.

tavakkoliamirmohammad marked 6 inline comments as done.

Added a new device mapping enum for the GPU memory hierarchy, removed the commented-out code, and refactored the allocation, deallocation, and copy functions into Linalg/Transforms/Promotion.cpp.

ftynse requested changes to this revision.Feb 24 2023, 11:54 AM

Nice, almost there, thank you!

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
99

Nit: we tend to prefer OpenCL-style terminology, so s/shared/workgroup (which is the actual name of the attribute). s/thread/workitem, s/block/workgroup.

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
396

Nit: top-level comments in MLIR should be prefixed with ///. Also, terminate sentences in documentation with a period.

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
1804

We don't see this line in the review. Please reupload the patch with more context (git diff -U99999) or use Arcanist (https://llvm.org/docs/Phabricator.html).

1808

Nit: MLIR uses camelCase for variable names.

1808–1812

Nit: any chance of adding a temporary to make this more readable?

... mapping = getMapping()->getValue()
auto address_space = mapping.take_front()...

I also don't understand why this needs access to the raw .data().

1813–1815

It is less expensive to query the details of the attribute than to construct a new one; the latter needs to take a lock. Something like address_space.getAddressSpace() == gpu::GPUDialect::getWorkgroupAddressSpace() is preferable here.
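Putting the two nits together, the check might look like this (a fragment inside the op's implementation; names assumed from the surrounding review):

// Bind the mapping to a named temporary, then query the existing
// attribute's address space instead of constructing a new attribute.
ArrayRef<Attribute> mapping = getMapping()->getValue();
auto memorySpace = cast<gpu::GPUMemorySpaceMappingAttr>(mapping.front());
if (memorySpace.getAddressSpace() ==
    gpu::GPUDialect::getWorkgroupAddressSpace()) {
  // Promote into workgroup (shared) memory.
}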

mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
406

Ultra-nit: extra whitespace between the function and its documentation.

This revision now requires changes to proceed.Feb 24 2023, 11:54 AM
tavakkoliamirmohammad marked 7 inline comments as done.
This comment was removed by tavakkoliamirmohammad.
tavakkoliamirmohammad planned changes to this revision.Feb 24 2023, 9:47 PM
tavakkoliamirmohammad added inline comments.
mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
1808–1812

I wanted to get the first element of the mapping, so I had to go through the chain from mlir::ArrayAttr to llvm::ArrayRef to the actual data. I thought this would be more readable, but it seems the new approach is more readable.

tavakkoliamirmohammad updated this revision to Diff 500358. Edited Feb 24 2023, 9:51 PM

Added /// comments. Replaced the expensive attribute construction with a simple query. Changed the description of GPUMemorySpaceMappingAttr to reflect OpenCL terminology. Also uploaded the patch with the -U99999 flag.

ftynse accepted this revision.Feb 25 2023, 6:41 AM
This revision is now accepted and ready to land.Feb 25 2023, 6:41 AM

Thank you! I don't have commit access to the LLVM project to land this patch.

hw-yuexy added a subscriber: hw-yuexy. Edited Mar 22 2023, 2:36 PM

Hi Amir,

I have a question regarding the promote op patch.

When I chain two promote ops together, both promoting operands of a tiled linalg.matmul (the first copies from global memory to L1, the second from L1 to L0), I found that all memref.copy ops are placed in the innermost loop, with copy sizes dependent on the innermost loop IV. However, this can lead to bad performance: when moving from GM to L1, it is often more profitable to move larger chunks of data (i.e., we could hoist the GM->L1 memref.copy ops into an outer loop and copy more data). I'm wondering if there are plans to make the promote op more flexible, so that it can hoist some memref.copy ops to outer loops when possible?

Thanks.

Hi.
Sorry, I didn't get the notification for this message. The reason is mostly that workgroup or private memory cannot hold the entire data due to its low capacity. The usual technique is to tile and, in a loop, copy only the data needed into workgroup or private memory. If you know your entire data fits in those memories, you can promote the data before tiling for computation.

Hi Amir,

Thanks for the nice patch! I'm trying to use this patch as-is, with as few changes as possible, for my accelerator, which unfortunately has a few different types of 'private' memories.
The 'mapping' attribute is defined in the td file as a DeviceMappingArrayAttr, which requires the address spaces (#gpu.memory_space<workgroup> or #gpu.memory_space<private>) to be defined inside the td file (GPUDeviceMappingAttr.td) and to inherit from GPUMemorySpaceMappingAttr. This means I would need to modify MLIR (e.g., potentially add my hardware dialect) with my private memory spaces in order to use the promote op's mapping. I know it is not recommended to expose the address space as an IntegerAttr or UnitAttr. However, it would be more flexible, to be honest, since the memspace in a memref is an integer...

Any advice would be greatly appreciated! Thank you!

Amy