Diff 236977

mlir/include/mlir/Dialect/GPU/GPUDialect.h

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	public:

/// Get the name of the attribute used to annotate kernel modules.		/// Get the name of the attribute used to annotate kernel modules.
static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }		static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }

/// Returns whether the given function is a kernel function, i.e., has the		/// Returns whether the given function is a kernel function, i.e., has the
/// 'gpu.kernel' attribute.		/// 'gpu.kernel' attribute.
static bool isKernel(Operation *op);		static bool isKernel(Operation *op);

		/// Gives the count of dimensions along which workgroups (threads, blocks)
		/// are indexed in the GPU dialect.
		// TODO(zinenko,herhut): consider generalizing this.
		static unsigned getNumWorkgroupDimensions() {
		  const unsigned numDimensions = 3;
		  return numDimensions;
		}

/// Returns the numeric value used to identify the workgroup memory address		/// Returns the numeric value used to identify the workgroup memory address
/// space.		/// space.
static unsigned getWorkgroupAddressSpace() { return 3; }		static unsigned getWorkgroupAddressSpace() { return 3; }

/// Returns the numeric value used to identify the private memory address		/// Returns the numeric value used to identify the private memory address
/// space.		/// space.
static unsigned getPrivateAddressSpace() { return 5; }		static unsigned getPrivateAddressSpace() { return 5; }

Show All 19 Lines

mlir/include/mlir/Dialect/GPU/GPUOps.td

Show First 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
let builders = [		let builders = [
OpBuilder<"Builder *builder, OperationState &result, StringRef name, "		OpBuilder<"Builder *builder, OperationState &result, StringRef name, "
"FunctionType type, ArrayRef<Type> workgroupAttributions = {}, "		"FunctionType type, ArrayRef<Type> workgroupAttributions = {}, "
"ArrayRef<Type> privateAttributions = {}, "		"ArrayRef<Type> privateAttributions = {}, "
"ArrayRef<NamedAttribute> attrs = {}">		"ArrayRef<NamedAttribute> attrs = {}">
];		];

let extraClassDeclaration = [{		let extraClassDeclaration = [{
		/// Adds a workgroup attribution of the MemRef type with the given shape and
		rriddleUnsubmitted Not Done Reply Inline Actions What is "op" here? rriddle: What is "op" here?
		/// element type. The attribution is inserted after the function arguments
		/// and the already existing workgroup attributions. Returns the value of
		/// the newly created attribution.
		Value addWorkgroupAttribution(ArrayRef<int64_t> shape, Type elementType);
		rriddleUnsubmitted Done Reply Inline Actions BTW: Value is now value-typed. rriddle: BTW: Value is now value-typed.

/// Returns `true` if the GPU function defined by this Op is a kernel, i.e.		/// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
/// it is intended to be launched from host.		/// it is intended to be launched from host.
bool isKernel() {		bool isKernel() {
return getAttrOfType<UnitAttr>(GPUDialect::getKernelFuncAttrName()) !=		return getAttrOfType<UnitAttr>(GPUDialect::getKernelFuncAttrName()) !=
nullptr;		nullptr;
}		}

/// Returns the type of the function this Op defines.		/// Returns the type of the function this Op defines.
▲ Show 20 Lines • Show All 460 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/GPU/MemoryPromotion.h

This file was added.

				//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===//
				//
				// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				rriddleUnsubmitted Done Reply Inline Actions This should be "MLIR Project" rriddle: This should be "MLIR Project"
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This header file declares the utility functions that generate IR copying
				// the data between different levels of memory hierarchy.
				//
				//===----------------------------------------------------------------------===//

				#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H
				#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H

				namespace mlir {

				namespace gpu {
				class GPUFuncOp;
				}

				/// Promotes a function argument to workgroup memory in the given function. The
				/// copies will be inserted in the beginning and in the end of the function.
				///
				/// `arg` is the index of the argument of `op` to promote. The argument must be
				/// a memref with a static shape; the implementation asserts on anything else.
				/// All existing uses of the argument are redirected to a workgroup attribution
				/// that is added to `op`.
				void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg);

				} // end namespace mlir

				#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H

mlir/include/mlir/IR/Block.h

Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	public:
reverse_args_iterator args_rbegin() { return getArguments().rbegin(); }		reverse_args_iterator args_rbegin() { return getArguments().rbegin(); }
reverse_args_iterator args_rend() { return getArguments().rend(); }		reverse_args_iterator args_rend() { return getArguments().rend(); }

bool args_empty() { return arguments.empty(); }		bool args_empty() { return arguments.empty(); }

/// Add one value to the argument list.		/// Add one value to the argument list.
BlockArgument addArgument(Type type);		BlockArgument addArgument(Type type);

		/// Insert one value to the position in the argument list indicated by the
		/// given iterator. The existing arguments are shifted. The block is expected
		/// not to have predecessors. Returns the newly created argument.
		BlockArgument insertArgument(args_iterator it, Type type);

/// Add one argument to the argument list for each type specified in the list.		/// Add one argument to the argument list for each type specified in the list.
iterator_range<args_iterator> addArguments(ArrayRef<Type> types);		iterator_range<args_iterator> addArguments(ArrayRef<Type> types);

/// Erase the argument at 'index' and remove it from the argument list. If		/// Erase the argument at 'index' and remove it from the argument list. If
/// 'updatePredTerms' is set to true, this argument is also removed from the		/// 'updatePredTerms' is set to true, this argument is also removed from the
/// terminators of each predecessor to this block.		/// terminators of each predecessor to this block.
void eraseArgument(unsigned index, bool updatePredTerms = true);		void eraseArgument(unsigned index, bool updatePredTerms = true);

▲ Show 20 Lines • Show All 246 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/CMakeLists.txt

	add_llvm_library(MLIRGPU			add_llvm_library(MLIRGPU
	IR/GPUDialect.cpp			IR/GPUDialect.cpp
	IR/DialectRegistration.cpp			IR/DialectRegistration.cpp
	Transforms/KernelOutlining.cpp			Transforms/KernelOutlining.cpp
				Transforms/MemoryPromotion.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU			${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
	)			)
	add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport)			add_dependencies(MLIRGPU
	target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport)			MLIRGPUOpsIncGen
				MLIREDSC
				MLIRIR
				MLIRLLVMIR
				MLIRLoopOps
				MLIRSupport
				MLIRTransformUtils
				LLVMSupport)
				target_link_libraries(MLIRGPU
				MLIREDSC
				MLIRIR
				MLIRLLVMIR
				MLIRLoopOps
				MLIRSupport
				MLIRTransformUtils
				mehdi_aminiUnsubmitted Done Reply Inline Actions I see locally this error: CMake Error at .../llvm-project/mlir/lib/Dialect/GPU/CMakeLists.txt:18 (target_link_libraries): Target "MLIRGPUOpsIncGen" of type UTILITY may not be linked into another target. One may link only to INTERFACE, OBJECT, STATIC or SHARED libraries, or to executables with the ENABLE_EXPORTS property set. mehdi_amini: I see locally this error: ``` CMake Error at .../llvm-project/mlir/lib/Dialect/GPU/CMakeLists.
				LLVMSupport)

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Show First 20 Lines • Show All 587 Lines • ▼ Show 20 Lines	LogicalResult verify(LaunchFuncOp op) {

return success();		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// GPUFuncOp		// GPUFuncOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

		/// Adds a workgroup attribution of the MemRef type with the given shape and
		/// element type to this function. The attribution is inserted as an argument
		/// of the first block of the body, right after the function arguments and the
		/// already existing workgroup attributions, and the attribute holding the
		/// number of workgroup attributions is incremented. Returns the value of the
		/// newly created attribution.
		Value GPUFuncOp::addWorkgroupAttribution(ArrayRef<int64_t> shape,
		                                         Type elementType) {
		  // Position immediately past the last existing workgroup attribution.
		  unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions();
		  Block &bodyBlock = body().front();
		  Value attribution = bodyBlock.insertArgument(
		      std::next(bodyBlock.args_begin(), pos),
		      MemRefType::get(shape, elementType, /*affineMapComposition=*/{},
		                      GPUDialect::getWorkgroupAddressSpace()));

		  // Keep the stored attribution count in sync with the block arguments; the
		  // new attribute reuses the integer type of the existing one.
		  auto numWorkgroupBuffersAttr =
		      getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName());
		  setAttr(getNumWorkgroupAttributionsAttrName(),
		          IntegerAttr::get(numWorkgroupBuffersAttr.getType(),
		                           numWorkgroupBuffersAttr.getValue() + 1));
		  return attribution;
		}

void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name,		void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name,
FunctionType type, ArrayRef<Type> workgroupAttributions,		FunctionType type, ArrayRef<Type> workgroupAttributions,
ArrayRef<Type> privateAttributions,		ArrayRef<Type> privateAttributions,
ArrayRef<NamedAttribute> attrs) {		ArrayRef<NamedAttribute> attrs) {
result.addAttribute(SymbolTable::getSymbolAttrName(),		result.addAttribute(SymbolTable::getSymbolAttrName(),
builder->getStringAttr(name));		builder->getStringAttr(name));
result.addAttribute(getTypeAttrName(), TypeAttr::get(type));		result.addAttribute(getTypeAttrName(), TypeAttr::get(type));
result.addAttribute(getNumWorkgroupAttributionsAttrName(),		result.addAttribute(getNumWorkgroupAttributionsAttrName(),
▲ Show 20 Lines • Show All 218 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp

This file was added.

				//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
				//
				// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file implements utilities that allow one to create IR moving the data
				// across different levels of the GPU memory hierarchy.
				//
				//===----------------------------------------------------------------------===//

				#include "mlir/Dialect/GPU/MemoryPromotion.h"
				#include "mlir/Dialect/GPU/GPUDialect.h"
				#include "mlir/Dialect/LoopOps/LoopOps.h"
				#include "mlir/EDSC/Builders.h"
				#include "mlir/EDSC/Helpers.h"
				#include "mlir/Pass/Pass.h"
				#include "mlir/Support/Functional.h"
				#include "mlir/Transforms/LoopUtils.h"

				using namespace mlir;
				using namespace mlir::gpu;

				/// Translates a GPU dimension index into its textual name: 0, 1 and 2 become
				/// "x", "y" and "z" respectively. Any other index is treated as unreachable.
				static StringRef getDimName(unsigned dim) {
				  switch (dim) {
				  case 0:
				    return "x";
				  case 1:
				    return "y";
				  case 2:
				    return "z";
				  default:
				    break;
				  }

				  llvm_unreachable("dimension ID overflow");
				}

				/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
				/// values using the bounds described by "bounds". Emits at least
				/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
				/// single-iteration loops. Maps the innermost loops to thread dimensions, in
				/// reverse order to enable access coalescing in the innermost loop.
				///
				/// "builder" and "loc" are used to create the thread id and block dimension
				/// operations. The caller in this file sets up an edsc::ScopedContext before
				/// calling, which the EDSC constructs below presumably rely on.
				static void insertCopyLoops(OpBuilder &builder, Location loc,
				                            edsc::MemRefView &bounds, Value from, Value to) {
				  // Number of thread dimensions the innermost loops are mapped to. Queried
				  // once so that the loop padding, the thread id / block size queries and the
				  // final mapping always agree with each other.
				  const unsigned numThreadDims = GPUDialect::getNumWorkgroupDimensions();

				  // Create EDSC handles for bounds.
				  unsigned rank = bounds.rank();
				  SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;

				  // Make sure we have enough loops to use all thread dimensions, these trivial
				  // loops should be outermost and therefore inserted first.
				  if (rank < numThreadDims) {
				    unsigned extraLoops = numThreadDims - rank;
				    edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
				    edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
				    lbs.resize(extraLoops, zero);
				    ubs.resize(extraLoops, one);
				    steps.resize(extraLoops, one);
				  }

				  // Add existing bounds.
				  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
				  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());

				  // Emit constant operations for steps.
				  steps.reserve(lbs.size());
				  llvm::transform(
				      bounds.getSteps(), std::back_inserter(steps),
				      [](int64_t step) { return edsc::intrinsics::constant_index(step); });

				  // Obtain thread identifiers and block sizes, necessary to map to them. One
				  // pair is created per thread dimension, indexed by the dimension number.
				  auto indexType = builder.getIndexType();
				  SmallVector<Value, 3> threadIds, blockDims;
				  for (unsigned i = 0; i < numThreadDims; ++i) {
				    auto dimName = builder.getStringAttr(getDimName(i));
				nicolasvasilacheUnsubmitted Done Reply Inline Actions To tie the previous suggestions together: use EDSC `MemRefView` (feel free to rename), to capture all lbs/ubs/steps add your dummy dims emit loops, they are captured automatically apply `mapToProcessorIds` on the k-innermost loops nicolasvasilache: To tie the previous suggestions together: 1. use EDSC `MemRefView` (feel free to rename), to…
				ftynseAuthorUnsubmitted Done Reply Inline Actions Done. The generated code is much uglier, but let's see if it triggers any issues wrt canonicalization between every step. ftynse: Done. The generated code is much uglier, but let's see if it triggers any issues wrt…
				    threadIds.push_back(
				        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
				    blockDims.push_back(
				        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
				  }
				nicolasvasilacheUnsubmitted Done Reply Inline Actions No need to reimplement EDSCs for the special case of constants, you can just use them: `constant_index` See https://github.com/llvm/llvm-project/blob/master/mlir/test/EDSC/builder-api-test.cpp#L473 and code around for usage. nicolasvasilache: No need to reimplement EDSCs for the special case of constants, you can just use them…

				  // Produce the loop nest with copies. Only the last "rank" induction
				  // variables index the memrefs; the padding loops are not used for indexing.
				  auto ivs = edsc::makeIndexHandles(lbs.size());
				  auto ivPtrs =
				nicolasvasilacheUnsubmitted Done Reply Inline Actions Since you're already using EDSCs, note that you already have support for this and the code above here: https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/EDSC/Helpers.h#L67 nicolasvasilache: Since you're already using EDSCs, note that you already have support for this and the code…
				      edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
				  edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
				    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
				    edsc::StdIndexedValue fromHandle(from), toHandle(to);
				    toHandle(activeIvs) = fromHandle(activeIvs);
				  });

				  // Map the innermost loops to threads in reverse order: the innermost loop
				  // gets thread dimension 0, the next one dimension 1, and so on.
				  for (auto en :
				       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
				           numThreadDims)))) {
				    auto loop = cast<loop::ForOp>(
				        en.value().getValue().getParentRegion()->getParentOp());
				    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
				                          {blockDims[en.index()]});
				  }
				}

				/// Emits the loop nests performing the copy to the designated location in the
				/// beginning of the region, and from the designated location immediately before
				/// the terminator of the first block of the region. The region is expected to
				/// have one block. This boils down to the following structure
				///
				/// ^bb(...):
				/// <loop-bound-computation>
				/// for %arg0 = ... to ... step ... {
				nicolasvasilacheUnsubmitted Done Reply Inline Actions Why roll your own special cases? You could just emit the loops as you do (note the loops themselves are captured since you use the EDSCs) and then apply: void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId, ArrayRef<Value> numProcessors); https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/Transforms/LoopUtils.h#L221 nicolasvasilache: Why roll your own special cases? You could just emit the loops as you do (note the loops…
				/// ...
				/// for %argN = <thread-id-x> to ... step <block-dim-x> {
				/// %0 = load %from[%arg0, ..., %argN]
				/// store %0, %to[%arg0, ..., %argN]
				/// }
				/// ...
				/// }
				/// gpu.barrier
				/// <... original body ...>
				/// gpu.barrier
				/// for %arg0 = ... to ... step ... {
				/// ...
				/// for %argN = <thread-id-x> to ... step <block-dim-x> {
				/// %1 = load %to[%arg0, ..., %argN]
				/// store %1, %from[%arg0, ..., %argN]
				/// }
				/// ...
				/// }
				///
				/// Inserts the barriers unconditionally since different threads may be copying
				/// values and reading them. An analysis would be required to eliminate barriers
				/// in case where value is only used by the thread that copies it. Both copies
				/// are inserted unconditionally, an analysis would be required to only copy
				/// live-in and live-out values when necessary. This copies the entire memref
				/// pointed to by "from". In case a smaller block would be sufficient, the
				/// caller can create a subview of the memref and promote it instead.
				///
				/// Both "from" and "to" must be memrefs of the same shape and of non-zero
				/// rank; this is only checked by assertions.
				static void insertCopies(Region &region, Location loc, Value from, Value to) {
				  auto fromType = from.getType().cast<MemRefType>();
				  auto toType = to.getType().cast<MemRefType>();
				  // The types are only inspected by the assertions below; the casts to void
				  // keep the variables "used" when assertions are compiled out.
				  (void)fromType;
				  (void)toType;
				  assert(fromType.getShape() == toType.getShape() &&
				         "expected the copied memrefs to have the same shape");
				  assert(fromType.getRank() != 0 && "cannot copy zero-rank memrefs");
				  assert(has_single_element(region) &&
				rriddleUnsubmitted Done Reply Inline Actions Can you add a message to this assert? rriddle: Can you add a message to this assert?
				         "unstructured control flow not supported");
				rriddleUnsubmitted Done Reply Inline Actions has_single_element rriddle: has_single_element

				  // Copy-in: emitted at the very beginning of the only block.
				  OpBuilder builder(region.getContext());
				  builder.setInsertionPointToStart(&region.front());

				  edsc::ScopedContext edscContext(builder, loc);
				  edsc::MemRefView fromView(from);
				  insertCopyLoops(builder, loc, fromView, from, to);
				  builder.create<gpu::BarrierOp>(loc);

				  // Copy-out: emitted right before the last operation of the block. The
				  // bounds of "from" are reused since both memrefs have the same shape.
				  builder.setInsertionPoint(&region.front().back());
				  builder.create<gpu::BarrierOp>(loc);
				  insertCopyLoops(builder, loc, fromView, to, from);
				}

				/// Moves the function argument with the given index to workgroup memory: a
				/// workgroup attribution of the same shape and element type is added to the
				/// function, it takes over all uses of the argument, and copies between the
				/// argument and the attribution are emitted at both ends of the function body.
				void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
				  Value original = op.getArgument(arg);
				  auto memrefType = original.getType().dyn_cast<MemRefType>();
				  assert(memrefType && memrefType.hasStaticShape() &&
				         "can only promote memrefs");

				  Value promoted = op.addWorkgroupAttribution(memrefType.getShape(),
				                                              memrefType.getElementType());

				  // The redirection of uses has to happen before the copies exist, otherwise
				  // the copies reading from or writing to the original argument would be
				  // redirected as well.
				  original.replaceAllUsesWith(promoted);
				  insertCopies(op.getBody(), op.getLoc(), original, promoted);
				}
				rriddleUnsubmitted Done Reply Inline Actions has_single_element rriddle: has_single_element

mlir/lib/IR/Block.cpp

Show First 20 Lines • Show All 173 Lines • ▼ Show 20 Lines	void Block::eraseArgument(unsigned index, bool updatePredTerms) {
// Erase this argument from each of the predecessor's terminator.		// Erase this argument from each of the predecessor's terminator.
for (auto predIt = pred_begin(), predE = pred_end(); predIt != predE;		for (auto predIt = pred_begin(), predE = pred_end(); predIt != predE;
++predIt) {		++predIt) {
auto predTerminator = (predIt)->getTerminator();		auto predTerminator = (predIt)->getTerminator();
predTerminator->eraseSuccessorOperand(predIt.getSuccessorIndex(), index);		predTerminator->eraseSuccessorOperand(predIt.getSuccessorIndex(), index);
}		}
}		}

		/// Creates a new argument of the given type at the position of the argument
		/// list designated by the iterator; the arguments from that position onwards
		/// move one slot further. Must only be called on blocks without predecessors.
		BlockArgument Block::insertArgument(args_iterator it, Type type) {
		  assert(llvm::empty(getPredecessors()) &&
		         "cannot insert arguments to blocks with predecessors");

		  // The incoming iterator walks the argument range, not the storage itself,
		  // so translate it into an offset and re-apply that offset to the storage.
		  size_t offset = std::distance(args_begin(), it);
		  auto newArgument = BlockArgument::create(type, this);
		  auto storagePosition = std::next(arguments.begin(), offset);
		  arguments.insert(storagePosition, newArgument);
		rriddleUnsubmitted Not Done Reply Inline Actions Why can you not use it directly? rriddle: Why can you not use it directly?
		ftynseAuthorUnsubmitted Done Reply Inline Actions Use what? `args_iterator` and `arguments.begin()` iterate through different things. ftynse: Use what? `args_iterator` and `arguments.begin()` iterate through different things.
		rriddleUnsubmitted Done Reply Inline Actions (Sorry, I meant 'it') Ahh yeah, I forgot about that. rriddle: (Sorry, I meant 'it') Ahh yeah, I forgot about that.
		  return newArgument;
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Terminator management		// Terminator management
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// Get the terminator operation of this block. This function asserts that		/// Get the terminator operation of this block. This function asserts that
/// the block has a valid terminator operation.		/// the block has a valid terminator operation.
Operation *Block::getTerminator() {		Operation *Block::getTerminator() {
assert(!empty() && !back().isKnownNonTerminator());		assert(!empty() && !back().isKnownNonTerminator());
▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

mlir/test/Dialect/GPU/promotion.mlir

This file was added.

				// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s

				module @foo attributes {gpu.kernel_module} {
				// Verify that the attribution was indeed introduced
				// CHECK-LABEL: @memref3d
				// CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
				// CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
				gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
				// Verify that loop bounds are emitted, the order does not matter.
				// CHECK-DAG: %[[c1:.*]] = constant 1
				// CHECK-DAG: %[[c4:.*]] = constant 4
				// CHECK-DAG: %[[c5:.*]] = constant 5
				// CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
				// CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
				// CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
				// CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
				// CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
				// CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}

				// Verify that loops for the copy are emitted. We only check the number of
				// loops here since their bounds are produced by mapLoopToProcessorIds,
				// tested separately.
				// CHECK: loop.for %[[i0:.*]] =
				// CHECK: loop.for %[[i1:.*]] =
				// CHECK: loop.for %[[i2:.*]] =

				// Verify that the copy is emitted and uses only the last two loops.
				// CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
				// CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]

				// Verify that the use has been rewritten.
				// CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
				"use"(%arg0) : (memref<5x4xf32>) -> ()


				// Verify that loops for the copy are emitted. We only check the number of
				// loops here since their bounds are produced by mapLoopToProcessorIds,
				// tested separately.
				// CHECK: loop.for %[[i0:.*]] =
				// CHECK: loop.for %[[i1:.*]] =
				// CHECK: loop.for %[[i2:.*]] =

				// Verify that the copy is emitted and uses only the last two loops.
				// CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
				// CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
				gpu.return
				}
				}

				// -----

				module @foo attributes {gpu.kernel_module} {
				// Verify that the attribution was indeed introduced
				// CHECK-LABEL: @memref5d
				// CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
				// CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
				gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
				// Verify that loop bounds are emitted, the order does not matter.
				// CHECK-DAG: %[[c0:.*]] = constant 0
				// CHECK-DAG: %[[c1:.*]] = constant 1
				// CHECK-DAG: %[[c4:.*]] = constant 4
				// CHECK-DAG: %[[c5:.*]] = constant 5
				// CHECK-DAG: %[[c6:.*]] = constant 6
				// CHECK-DAG: %[[c7:.*]] = constant 7
				// CHECK-DAG: %[[c8:.*]] = constant 8
				// CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
				// CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
				// CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
				// CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
				// CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
				// CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}

				// Verify that loops for the copy are emitted.
				// CHECK: loop.for %[[i0:.*]] =
				// CHECK: loop.for %[[i1:.*]] =
				// CHECK: loop.for %[[i2:.*]] =
				// CHECK: loop.for %[[i3:.*]] =
				// CHECK: loop.for %[[i4:.*]] =

				// Verify that the copy is emitted.
				// CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
				// CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]

				// Verify that the use has been rewritten.
				// CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
				"use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()

				// Verify that loops for the copy are emitted.
				// CHECK: loop.for %[[i0:.*]] =
				// CHECK: loop.for %[[i1:.*]] =
				// CHECK: loop.for %[[i2:.*]] =
				// CHECK: loop.for %[[i3:.*]] =
				// CHECK: loop.for %[[i4:.*]] =

				// Verify that the copy is emitted.
				// CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
				// CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
				gpu.return
				}
				}

				// -----

				module @foo attributes {gpu.kernel_module} {
				// Check that attribution insertion works fine.
				// CHECK-LABEL: @insert
				// CHECK-SAME: (%{{.*}}: memref<4xf32>
				// CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
				// CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
				// CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
				gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
				workgroup(%arg1: memref<1x1xf64, 3>)
				private(%arg2: memref<1x1xi64, 5>)
				kernel {
				// CHECK: "use"(%[[wg2]])
				"use"(%arg0) : (memref<4xf32>) -> ()
				gpu.return
				}
				}

mlir/test/lib/Transforms/CMakeLists.txt

	add_llvm_library(MLIRTestTransforms			add_llvm_library(MLIRTestTransforms
	TestCallGraph.cpp			TestCallGraph.cpp
	TestConstantFold.cpp			TestConstantFold.cpp
	TestLoopFusion.cpp			TestLoopFusion.cpp
				TestGpuMemoryPromotion.cpp
	TestInlining.cpp			TestInlining.cpp
	TestLinalgTransforms.cpp			TestLinalgTransforms.cpp
	TestLiveness.cpp			TestLiveness.cpp
	TestLoopMapping.cpp			TestLoopMapping.cpp
	TestLoopParametricTiling.cpp			TestLoopParametricTiling.cpp
	TestOpaqueLoc.cpp			TestOpaqueLoc.cpp
	TestMemRefStrideCalculation.cpp			TestMemRefStrideCalculation.cpp
	TestVectorToLoopsConversion.cpp			TestVectorToLoopsConversion.cpp
	TestVectorTransforms.cpp			TestVectorTransforms.cpp
	TestVectorizationUtils.cpp			TestVectorizationUtils.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms			${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
	)			)
	include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../TestDialect)			include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../TestDialect)
	include_directories(${CMAKE_CURRENT_BINARY_DIR}/../TestDialect)			include_directories(${CMAKE_CURRENT_BINARY_DIR}/../TestDialect)
	include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../DeclarativeTransforms)			include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../DeclarativeTransforms)
	include_directories(${CMAKE_CURRENT_BINARY_DIR}/../DeclarativeTransforms)			include_directories(${CMAKE_CURRENT_BINARY_DIR}/../DeclarativeTransforms)
	add_dependencies(MLIRTestTransforms MLIRStandardOpsIncGen)			add_dependencies(MLIRTestTransforms MLIRStandardOpsIncGen)
	add_dependencies(MLIRTestTransforms MLIRTestLinalgTransformPatternsIncGen)			add_dependencies(MLIRTestTransforms MLIRTestLinalgTransformPatternsIncGen)
	add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)			add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)
	target_link_libraries(MLIRTestTransforms			target_link_libraries(MLIRTestTransforms
	MLIRAffineOps			MLIRAffineOps
	MLIRAnalysis			MLIRAnalysis
				MLIREDSC
				MLIRGPU
	MLIRLoopOps			MLIRLoopOps
	MLIRPass			MLIRPass
	MLIRTestDialect			MLIRTestDialect
	MLIRVectorOps			MLIRVectorOps
	)			)

mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp

This file was added.

				//===- TestGpuMemoryPromotion.cpp - Test pass for GPU promotion -----------===//
				//
				// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				rriddleUnsubmitted Done Reply Inline Actions Same here. rriddle: Same here.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file implements the pass testing the utilities for moving data across
				// different levels of the GPU memory hierarchy.
				//
				//===----------------------------------------------------------------------===//

				#include "mlir/Dialect/GPU/GPUDialect.h"
				#include "mlir/Dialect/GPU/MemoryPromotion.h"
				#include "mlir/IR/Attributes.h"
				#include "mlir/Pass/Pass.h"

				using namespace mlir;

				namespace {
				/// Test-only pass exercising the promotion to workgroup memory on GPU
				/// functions. Every argument carrying the "gpu.test_promote_workgroup" unit
				/// attribute gets promoted. Neither the legality (e.g., amount of memory used)
				/// nor the profitability (e.g., whether previously uncoalesced loads become
				/// coalesced) of the promotion is checked.
				class TestGpuMemoryPromotionPass
				    : public OperationPass<TestGpuMemoryPromotionPass, gpu::GPUFuncOp> {
				  void runOnOperation() override {
				    gpu::GPUFuncOp funcOp = getOperation();
				    // The argument count is read once, before any promotion takes place.
				    const unsigned numArgs = funcOp.getNumArguments();
				    for (unsigned idx = 0; idx != numArgs; ++idx) {
				      if (!funcOp.getArgAttrOfType<UnitAttr>(idx,
				                                             "gpu.test_promote_workgroup"))
				        continue;
				      promoteToWorkgroupMemory(funcOp, idx);
				    }
				  }
				};
				} // end namespace

				// Registers the test pass under the "test-gpu-memory-promotion" name, which
				// is the flag the promotion test passes to mlir-opt.
				static PassRegistration<TestGpuMemoryPromotionPass> registration(
				"test-gpu-memory-promotion",
				"Promotes the annotated arguments of gpu.func to workgroup memory.");

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] GPU: introduce utilities for promotion to workgroup memory
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 236977

mlir/include/mlir/Dialect/GPU/GPUDialect.h

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/include/mlir/Dialect/GPU/MemoryPromotion.h

mlir/include/mlir/IR/Block.h

mlir/lib/Dialect/GPU/CMakeLists.txt

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp

mlir/lib/IR/Block.cpp

mlir/test/Dialect/GPU/promotion.mlir

mlir/test/lib/Transforms/CMakeLists.txt

mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] GPU: introduce utilities for promotion to workgroup memoryClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 236977

mlir/include/mlir/Dialect/GPU/GPUDialect.h

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/include/mlir/Dialect/GPU/MemoryPromotion.h

mlir/include/mlir/IR/Block.h

mlir/lib/Dialect/GPU/CMakeLists.txt

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp

mlir/lib/IR/Block.cpp

mlir/test/Dialect/GPU/promotion.mlir

mlir/test/lib/Transforms/CMakeLists.txt

mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp

[mlir] GPU: introduce utilities for promotion to workgroup memory
ClosedPublic