This is an archive of the discontinued LLVM Phabricator instance.

mlir/include/mlir/Dialect/GPU/GPUOps.td
810	Not for now but I was wondering whether we should have a different resource for GPU allocations? Maybe also to use as a key for buffer assignment to insert the corresponding free.
mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
180	These should implement the public `matchAndRewrite(gpu::AllocOp op, ArrayRef<Value> operands, ConversionPatternRewriter &rewriter)` instead of overriding the private one. Just noticed that all patterns currently do the latter.
345	Use the `gpu::AllocOp` typed version.
350	Instead of `match`, would it be more natural to use `is` or `have` or something?
658–662	nit: sort.
mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
7	Maybe test with a bit more context to also capture that streams are passed correctly? I think it is ok to ignore the memref construction part, though.

This revision now requires changes to proceed. · Nov 23 2020, 6:36 AM

Thanks for the review!

Herald added a subscriber: nimiwio. · View Herald Transcript · Nov 24 2020, 1:08 PM

csigg marked an inline comment as done. · Nov 24 2020, 1:09 PM

csigg added inline comments.

mlir/include/mlir/Dialect/GPU/GPUOps.td
810	Ack, makes sense. I can look into it.
mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
180	That overload does not currently exist. I can add it to `ConvertOpToLLVMPattern` in a separate change and make this one final, just like in `OpRewritePattern`.
345	See comment above.

Harbormaster completed remote builds in B80008: Diff 307439. · Nov 24 2020, 1:23 PM

herhut accepted this revision. · Nov 26 2020, 5:04 AM

This revision is now accepted and ready to land. · Nov 26 2020, 5:04 AM

Closed by commit rG5535696c386b: [mlir] Add gpu.allocate, gpu.deallocate ops with LLVM lowering to runtime… (authored by csigg). · Explain Why · Nov 27 2020, 12:41 AM

This revision was automatically updated to reflect the committed changes.

csigg added a commit: rG5535696c386b: [mlir] Add gpu.allocate, gpu.deallocate ops with LLVM lowering to runtime….

Revision Contents

Path

Size

mlir/

include/

mlir/

Dialect/

GPU/

GPUDialect.h

1 line

GPUOps.td

75 lines

lib/

Conversion/

GPUCommon/

ConvertLaunchFuncToRuntimeCalls.cpp

146 lines

test/

Conversion/

GPUCommon/

lower-alloc-to-gpu-runtime-calls.mlir

19 lines

Dialect/

GPU/

ops.mlir

17 lines

tools/

mlir-cuda-runner/

cuda-runtime-wrappers.cpp

10 lines

mlir-rocm-runner/

rocm-runtime-wrappers.cpp

10 lines

Diff 307977

mlir/include/mlir/Dialect/GPU/GPUDialect.h

	Show All 13 Lines
	#ifndef MLIR_DIALECT_GPU_GPUDIALECT_H			#ifndef MLIR_DIALECT_GPU_GPUDIALECT_H
	#define MLIR_DIALECT_GPU_GPUDIALECT_H			#define MLIR_DIALECT_GPU_GPUDIALECT_H

	#include "mlir/IR/Builders.h"			#include "mlir/IR/Builders.h"
	#include "mlir/IR/Dialect.h"			#include "mlir/IR/Dialect.h"
	#include "mlir/IR/FunctionSupport.h"			#include "mlir/IR/FunctionSupport.h"
	#include "mlir/IR/OpDefinition.h"			#include "mlir/IR/OpDefinition.h"
	#include "mlir/IR/OpImplementation.h"			#include "mlir/IR/OpImplementation.h"
				#include "mlir/IR/StandardTypes.h"
	#include "mlir/IR/SymbolTable.h"			#include "mlir/IR/SymbolTable.h"
	#include "mlir/Interfaces/SideEffectInterfaces.h"			#include "mlir/Interfaces/SideEffectInterfaces.h"

	namespace mlir {			namespace mlir {
	class FuncOp;			class FuncOp;

	namespace gpu {			namespace gpu {

	Show All 29 Lines

mlir/include/mlir/Dialect/GPU/GPUOps.td

Show First 20 Lines • Show All 798 Lines • ▼ Show 20 Lines	def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);		let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
let results = (outs Optional<GPU_AsyncToken>:$asyncToken);		let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

let assemblyFormat = [{		let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict		custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
}];		}];
}		}

		def GPU_AllocOp : GPU_Op<"alloc", [
		GPU_AsyncOpInterface,
		AttrSizedOperandSegments,
		MemoryEffects<[MemAlloc<DefaultResource>]>
		herhutUnsubmitted Not Done Reply Inline Actions Not for now but I was wondering whether we should have a different resource for GPU allocations? Maybe also to use as a key for buffer assignment to insert the corresponding free. herhut: Not for now but I was wondering whether we should have a different resource for GPU allocations?
		csiggAuthorUnsubmitted Done Reply Inline Actions Ack, makes sense. I can look into it. csigg: Ack, makes sense. I can look into it.
		]> {

		let summary = "GPU memory allocation operation.";
		let description = [{
		The `gpu.alloc` operation allocates a region of memory on the GPU. It is
		similar to the `std.alloc` op, but supports asynchronous GPU execution.

		The op does not execute before all async dependencies have finished
		executing.

		If the `async` keyword is present, the op is executed asynchronously (i.e.
		it does not block until the execution has finished on the device). In
		that case, it also returns a !gpu.async.token.

		Example:

		```mlir
		%memref, %token = gpu.alloc async [%dep] (%width) : memref<64x?xf32, 1>
		```
		}];

		let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands);
		let results = (outs Res<AnyMemRef, "", [MemAlloc<DefaultResource>]>:$memref,
		Optional<GPU_AsyncToken>:$asyncToken);

		let extraClassDeclaration = [{
		MemRefType getType() { return memref().getType().cast<MemRefType>(); }
		}];

		let assemblyFormat = [{
		custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) ` `
		`(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref)
		}];
		}

		def GPU_DeallocOp : GPU_Op<"dealloc", [
		GPU_AsyncOpInterface, MemoryEffects<[MemFree]>
		]> {

		let summary = "GPU memory deallocation operation";

		let description = [{
		The `gpu.dealloc` operation frees the region of memory referenced by a
		memref which was originally created by the `gpu.alloc` operation. It is
		similar to the `std.dealloc` op, but supports asynchronous GPU execution.

		The op does not execute before all async dependencies have finished
		executing.

		If the `async` keyword is present, the op is executed asynchronously (i.e.
		it does not block until the execution has finished on the device). In
		that case, it returns a !gpu.async.token.

		Example:

		```mlir
		%token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1>
		```
		}];

		let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		Arg<AnyMemRef, "", [MemFree]>:$memref);
		let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

		let assemblyFormat = [{
		custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
		$memref attr-dict `:` type($memref)
		}];
		}

#endif // GPU_OPS		#endif // GPU_OPS

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

Show First 20 Lines • Show All 136 Lines • ▼ Show 20 Lines	FunctionCallBuilder eventRecordCallBuilder = {
llvmVoidType,		llvmVoidType,
{llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};		{llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
FunctionCallBuilder hostRegisterCallBuilder = {		FunctionCallBuilder hostRegisterCallBuilder = {
"mgpuMemHostRegisterMemRef",		"mgpuMemHostRegisterMemRef",
llvmVoidType,		llvmVoidType,
{llvmIntPtrType /* intptr_t rank */,		{llvmIntPtrType /* intptr_t rank */,
llvmPointerType /* void *memrefDesc */,		llvmPointerType /* void *memrefDesc */,
llvmIntPtrType /* intptr_t elementSizeBytes */}};		llvmIntPtrType /* intptr_t elementSizeBytes */}};
		FunctionCallBuilder allocCallBuilder = {
		"mgpuMemAlloc",
		llvmPointerType /* void * */,
		{llvmIntPtrType /* intptr_t sizeBytes */,
		llvmPointerType /* void *stream */}};
		FunctionCallBuilder deallocCallBuilder = {
		"mgpuMemFree",
		llvmVoidType,
		{llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
};		};

/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime		/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).		/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern		class ConvertHostRegisterOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {		: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:		public:
ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)		ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}		: ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:		private:
LogicalResult		LogicalResult
matchAndRewrite(Operation *op, ArrayRef<Value> operands,		matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;		ConversionPatternRewriter &rewriter) const override;
};		};

		/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
		/// call. Currently it supports CUDA and ROCm (HIP).
		class ConvertAllocOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
		public:
		ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(Operation *op, ArrayRef<Value> operands,
		herhutUnsubmitted Not Done Reply Inline Actions These should implement the public `matchAndRewrite(gpu::AllocOp op, ArrayRef<Value> operands, ConversionPatternRewriter &rewriter` instead of overriding the private one. Just noticed that all patterns currently do the latter. herhut: These should implement the public `matchAndRewrite(gpu::AllocOp op, ArrayRef<Value> operands…
		csiggAuthorUnsubmitted Done Reply Inline Actions That overload does not currently exist. I can add it to `ConvertOpToLLVMPattern` in a separate change and make this one final, just like in `OpRewritePattern`. csigg: That overload does not currently exist. I can add it to `ConvertOpToLLVMPattern` in a separate…
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
		/// call. Currently it supports CUDA and ROCm (HIP).
		class ConvertDeallocOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
		public:
		ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(Operation *op, ArrayRef<Value> operands,
		ConversionPatternRewriter &rewriter) const override;
		};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime		/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).		/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern		class ConvertWaitOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {		: public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:		public:
ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)		ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}		: ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
LogicalResult matchAndRewrite(gpu::GPUModuleOp op,		LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
// GPU kernel modules are no longer necessary since we have a global		// GPU kernel modules are no longer necessary since we have a global
// constant with the CUBIN, or HSACO data.		// constant with the CUBIN, or HSACO data.
rewriter.eraseOp(op);		rewriter.eraseOp(op);
return success();		return success();
}		}
};		};

} // namespace		} // namespace

void GpuToLLVMConversionPass::runOnOperation() {		void GpuToLLVMConversionPass::runOnOperation() {
LLVMTypeConverter converter(&getContext());		LLVMTypeConverter converter(&getContext());
OwningRewritePatternList patterns;		OwningRewritePatternList patterns;
populateStdToLLVMConversionPatterns(converter, patterns);		populateStdToLLVMConversionPatterns(converter, patterns);
populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);		populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

Show All 12 Lines	auto function = [&] {
return OpBuilder(module.getBody()->getTerminator())		return OpBuilder(module.getBody()->getTerminator())
.create<LLVM::LLVMFuncOp>(loc, functionName, functionType);		.create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
}();		}();
return builder.create<LLVM::CallOp>(		return builder.create<LLVM::CallOp>(
loc, const_cast<LLVM::LLVMType &>(functionType).getFunctionResultType(),		loc, const_cast<LLVM::LLVMType &>(functionType).getFunctionResultType(),
builder.getSymbolRefAttr(function), arguments);		builder.getSymbolRefAttr(function), arguments);
}		}

// Returns whether value is of LLVM type.		// Returns whether all operands are of LLVM type.
static bool isLLVMType(Value value) {		static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
		ConversionPatternRewriter &rewriter) {
		if (!llvm::all_of(operands, [](Value value) {
return value.getType().isa<LLVM::LLVMType>();		return value.getType().isa<LLVM::LLVMType>();
		}))
		return rewriter.notifyMatchFailure(
		op, "Cannot convert if operands aren't of LLVM type.");
		return success();
		}

		static LogicalResult
		isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
		gpu::AsyncOpInterface op) {
		if (op.getAsyncDependencies().size() != 1)
		return rewriter.notifyMatchFailure(
		op, "Can only convert with exactly one async dependency.");

		if (!op.getAsyncToken())
		return rewriter.notifyMatchFailure(op, "Can convert only async version.");

		return success();
}		}

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(		LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
Operation *op, ArrayRef<Value> operands,		Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {		ConversionPatternRewriter &rewriter) const {
if (!llvm::all_of(operands, isLLVMType))		if (failed(areAllLLVMTypes(op, operands, rewriter)))
return rewriter.notifyMatchFailure(		return failure();
op, "Cannot convert if operands aren't of LLVM type.");

Location loc = op->getLoc();		Location loc = op->getLoc();

auto memRefType = cast<gpu::HostRegisterOp>(op).value().getType();		auto memRefType = cast<gpu::HostRegisterOp>(op).value().getType();
auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();		auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
auto elementSize = getSizeInBytes(loc, elementType, rewriter);		auto elementSize = getSizeInBytes(loc, elementType, rewriter);

auto arguments =		auto arguments =
typeConverter.promoteOperands(loc, op->getOperands(), operands, rewriter);		typeConverter.promoteOperands(loc, op->getOperands(), operands, rewriter);
arguments.push_back(elementSize);		arguments.push_back(elementSize);
hostRegisterCallBuilder.create(loc, rewriter, arguments);		hostRegisterCallBuilder.create(loc, rewriter, arguments);

rewriter.eraseOp(op);		rewriter.eraseOp(op);
return success();		return success();
}		}

		LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
		Operation *op, ArrayRef<Value> operands,
		herhutUnsubmitted Not Done Reply Inline Actions Use the `gpu::AllocOp` typed version. herhut: Use the `gpu::AllocOp` typed version.
		csiggAuthorUnsubmitted Done Reply Inline Actions See comment above. csigg: See comment above.
		ConversionPatternRewriter &rewriter) const {
		auto allocOp = cast<gpu::AllocOp>(op);
		MemRefType memRefType = allocOp.getType();

		if (failed(areAllLLVMTypes(op, operands, rewriter)) ||
		herhutUnsubmitted Done Reply Inline Actions Instead of `match`, would it be more natural to use `is` or `have` or something? herhut: Instead of `match`, would it be more natural to use `is` or `have` or something?
		!isSupportedMemRefType(memRefType) ||
		failed(
		isAsyncWithOneDependency(rewriter, cast<gpu::AsyncOpInterface>(op))))
		return failure();

		auto loc = op->getLoc();

		// Get shape of the memref as values: static sizes are constant
		// values and dynamic sizes are passed to 'alloc' as operands.
		SmallVector<Value, 4> shape;
		SmallVector<Value, 4> strides;
		Value sizeBytes;
		getMemRefDescriptorSizes(loc, memRefType, operands, rewriter, shape, strides,
		sizeBytes);

		// Allocate the underlying buffer and store a pointer to it in the MemRef
		// descriptor.
		Type elementPtrType = this->getElementPtrType(memRefType);
		auto adaptor = gpu::AllocOpAdaptor(operands, op->getAttrDictionary());
		auto stream = adaptor.asyncDependencies().front();
		Value allocatedPtr =
		allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
		allocatedPtr =
		rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

		// No alignment.
		Value alignedPtr = allocatedPtr;

		// Create the MemRef descriptor.
		auto memRefDescriptor = this->createMemRefDescriptor(
		loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

		rewriter.replaceOp(op, {memRefDescriptor, stream});

		return success();
		}

		LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
		Operation *op, ArrayRef<Value> operands,
		ConversionPatternRewriter &rewriter) const {
		if (failed(areAllLLVMTypes(op, operands, rewriter)) ||
		failed(
		isAsyncWithOneDependency(rewriter, cast<gpu::AsyncOpInterface>(op))))
		return failure();

		Location loc = op->getLoc();

		auto adaptor = gpu::DeallocOpAdaptor(operands, op->getAttrDictionary());
		Value pointer =
		MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
		auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
		Value stream = adaptor.asyncDependencies().front();
		deallocCallBuilder.create(loc, rewriter, {casted, stream});

		rewriter.replaceOp(op, {stream});
		return success();
		}

// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm		// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
// streams (i.e. void*). The converted op synchronizes the host with every		// streams (i.e. void*). The converted op synchronizes the host with every
// stream and then destroys it. That is, it assumes that the stream is not used		// stream and then destroys it. That is, it assumes that the stream is not used
// afterwards. In case this isn't correct, we will get a runtime error.		// afterwards. In case this isn't correct, we will get a runtime error.
// Eventually, we will have a pass that guarantees this property.		// Eventually, we will have a pass that guarantees this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(		LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
Operation *op, ArrayRef<Value> operands,		Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {		ConversionPatternRewriter &rewriter) const {
▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
// call %streamDestroy(%4)		// call %streamDestroy(%4)
// call %moduleUnload(%1)		// call %moduleUnload(%1)
//		//
// If the op is async, the stream corresponds to the (single) async dependency		// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.		// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(		LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
Operation *op, ArrayRef<Value> operands,		Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {		ConversionPatternRewriter &rewriter) const {
if (!llvm::all_of(operands, isLLVMType))		if (failed(areAllLLVMTypes(op, operands, rewriter)))
return rewriter.notifyMatchFailure(		return failure();
op, "Cannot convert if operands aren't of LLVM type.");

auto launchOp = cast<gpu::LaunchFuncOp>(op);		auto launchOp = cast<gpu::LaunchFuncOp>(op);

if (launchOp.asyncDependencies().size() > 1)		if (launchOp.asyncDependencies().size() > 1)
return rewriter.notifyMatchFailure(		return rewriter.notifyMatchFailure(
op, "Cannot convert with more than one async dependency.");		op, "Cannot convert with more than one async dependency.");

// Fail when the synchronous version of the op has async dependencies. The		// Fail when the synchronous version of the op has async dependencies. The
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines

void mlir::populateGpuToLLVMConversionPatterns(		void mlir::populateGpuToLLVMConversionPatterns(
LLVMTypeConverter &converter, OwningRewritePatternList &patterns,		LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
StringRef gpuBinaryAnnotation) {		StringRef gpuBinaryAnnotation) {
converter.addConversion(		converter.addConversion(
[context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {		[context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
return LLVM::LLVMType::getInt8PtrTy(context);		return LLVM::LLVMType::getInt8PtrTy(context);
});		});
patterns.insert<ConvertHostRegisterOpToGpuRuntimeCallPattern,		patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern,		ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern>(converter);		ConvertHostRegisterOpToGpuRuntimeCallPattern,
		ConvertWaitAsyncOpToGpuRuntimeCallPattern,
		ConvertWaitOpToGpuRuntimeCallPattern>(converter);
		herhutUnsubmitted Done Reply Inline Actions nit: sort. herhut: nit: sort.
patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(		patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation);		converter, gpuBinaryAnnotation);
patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());		patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
}		}

mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir

This file was added.

				// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm | FileCheck %s

				module attributes {gpu.container_module} {
				func @main() {
				// CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate()
				%0 = gpu.wait async
				// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint
				herhutUnsubmitted Done Reply Inline Actions Maybe test with a bit more context to also capture that streams are passed correctly? I think it is ok to ignore the memref construction part, though. herhut: Maybe test with a bit more context to also capture that streams are passed correctly? I think…
				// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
				%1, %2 = gpu.alloc async [%0] () : memref<13xf32>
				// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
				// CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
				// CHECK: llvm.call @mgpuMemFree(%[[void_ptr]], %[[stream]])
				%3 = gpu.dealloc async [%2] %1 : memref<13xf32>
				// CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
				// CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
				gpu.wait [%3]
				return
				}
				}

mlir/test/Dialect/GPU/ops.mlir

Show First 20 Lines • Show All 138 Lines • ▼ Show 20 Lines	module attributes {gpu.container_module} {
gpu.module @explicit_attributions {		gpu.module @explicit_attributions {
// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32, {{.*}}: memref<?xf32>) workgroup({{.*}}: memref<5xf32, 3>) private({{.*}}: memref<5xf32, 5>)		// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32, {{.*}}: memref<?xf32>) workgroup({{.*}}: memref<5xf32, 3>) private({{.*}}: memref<5xf32, 5>)
"gpu.func"() ( {		"gpu.func"() ( {
^bb0(%arg0: f32, %arg1: memref<?xf32>, %arg2: memref<5xf32, 3>, %arg3: memref<5xf32, 5>):		^bb0(%arg0: f32, %arg1: memref<?xf32>, %arg2: memref<5xf32, 3>, %arg3: memref<5xf32, 5>):
"gpu.return"() : () -> ()		"gpu.return"() : () -> ()
} ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 1: i64} : () -> ()		} ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 1: i64} : () -> ()
}		}

		func @alloc() {
		// CHECK-LABEL: func @alloc()

		// CHECK: %[[m0:.*]] = gpu.alloc () : memref<13xf32, 1>
		%m0 = gpu.alloc () : memref<13xf32, 1>
		// CHECK: gpu.dealloc %[[m0]] : memref<13xf32, 1>
		gpu.dealloc %m0 : memref<13xf32, 1>

		%t0 = gpu.wait async
		// CHECK: %[[m1:.*]], %[[t1:.*]] = gpu.alloc async [{{.*}}] () : memref<13xf32, 1>
		%m1, %t1 = gpu.alloc async [%t0] () : memref<13xf32, 1>
		// CHECK: gpu.dealloc async [%[[t1]]] %[[m1]] : memref<13xf32, 1>
		%t2 = gpu.dealloc async [%t1] %m1 : memref<13xf32, 1>

		return
		}

func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {		func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {
// CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)		// CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)
// CHECK: return {{.*}} : !gpu.async.token		// CHECK: return {{.*}} : !gpu.async.token
return %arg0 : !gpu.async.token		return %arg0 : !gpu.async.token
}		}

func @async_wait() {		func @async_wait() {
// CHECK-LABEL: func @async_wait		// CHECK-LABEL: func @async_wait
Show All 15 Lines

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

	Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines
	extern "C" void mgpuEventSynchronize(CUevent event) {			extern "C" void mgpuEventSynchronize(CUevent event) {
	CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));			CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));
	}			}

	extern "C" void mgpuEventRecord(CUevent event, CUstream stream) {			extern "C" void mgpuEventRecord(CUevent event, CUstream stream) {
	CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));			CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
	}			}

				extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
				CUdeviceptr ptr;
				CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
				return reinterpret_cast<void *>(ptr);
				}

				extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {
				CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
				}

	/// Helper functions for writing mlir example code			/// Helper functions for writing mlir example code

	// Allows to register byte array with the CUDA runtime. Helpful until we have			// Allows to register byte array with the CUDA runtime. Helpful until we have
	// transfer functions implemented.			// transfer functions implemented.
	extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {			extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
	CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));			CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
	}			}

	Show All 23 Lines

mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp

	Show First 20 Lines • Show All 102 Lines • ▼ Show 20 Lines
	extern "C" void mgpuEventSynchronize(hipEvent_t event) {			extern "C" void mgpuEventSynchronize(hipEvent_t event) {
	HIP_REPORT_IF_ERROR(hipEventSynchronize(event));			HIP_REPORT_IF_ERROR(hipEventSynchronize(event));
	}			}

	extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {			extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
	HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));			HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
	}			}

				extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
				void *ptr;
				HIP_REPORT_IF_ERROR(hipMemAlloc(&ptr, sizeBytes));
				return ptr;
				}

				extern "C" void mgpuMemFree(void *ptr, hipStream_t /*stream*/) {
				HIP_REPORT_IF_ERROR(hipMemFree(ptr));
				}

	/// Helper functions for writing mlir example code			/// Helper functions for writing mlir example code

	// Allows to register byte array with the ROCM runtime. Helpful until we have			// Allows to register byte array with the ROCM runtime. Helpful until we have
	// transfer functions implemented.			// transfer functions implemented.
	extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {			extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
	HIP_REPORT_IF_ERROR(hipHostRegister(ptr, sizeBytes, /*flags=*/0));			HIP_REPORT_IF_ERROR(hipHostRegister(ptr, sizeBytes, /*flags=*/0));
	}			}

	▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Add gpu.allocate, gpu.deallocate ops with LLVM lowering to runtime function calls. · Closed · Public

Details

Diff Detail

Event Timeline