This is an archive of the discontinued LLVM Phabricator instance.

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
684	For identity layout (verified on line 672), stride[0]*size[0] gives the correct number of elements (stride[0] is 'product(size[1..n-1])'). `getMemrefDescriptorSizes` is not the right API here, you would first need to extract the dynamic sizes from the struct.

herhut accepted this revision. Dec 22 2020, 2:16 AM

herhut added inline comments.

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
684	Ah, this is subtle. Can you leave a comment so I understand this next time round, as well? Especially as the meaning of `isSupportedMemRefType` is not obvious here.

This revision is now accepted and ready to land. Dec 22 2020, 2:16 AM

Add comment.

csigg added inline comments. Dec 22 2020, 8:44 AM

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
684	Added comment. I've been wanting to rename `isSupportedMemRefType`. Will do in a separate revision.

Harbormaster completed remote builds in B83289: Diff 313358. Dec 22 2020, 9:11 AM

Rebase.

Harbormaster completed remote builds in B83311: Diff 313403. Dec 22 2020, 12:06 PM

Closed by commit rGdf6cbd37f57f: [mlir] Lower gpu.memcpy to GPU runtime calls. (authored by csigg). · Explain Why · Dec 22 2020, 1:49 PM

This revision was automatically updated to reflect the committed changes.

csigg added a commit: rGdf6cbd37f57f: [mlir] Lower gpu.memcpy to GPU runtime calls..

Revision Contents

Path

Size

mlir/

lib/

Conversion/

GPUCommon/

ConvertLaunchFuncToRuntimeCalls.cpp

65 lines

test/

Conversion/

GPUCommon/

lower-memcpy-to-gpu-runtime-calls.mlir

19 lines

tools/

mlir-cuda-runner/

cuda-runtime-wrappers.cpp

7 lines

mlir-rocm-runner/

rocm-runtime-wrappers.cpp

5 lines

Diff 313426

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

Show First 20 Lines • Show All 145 Lines • ▼ Show 20 Lines	FunctionCallBuilder allocCallBuilder = {
"mgpuMemAlloc",		"mgpuMemAlloc",
llvmPointerType /* void * */,		llvmPointerType /* void * */,
{llvmIntPtrType /* intptr_t sizeBytes */,		{llvmIntPtrType /* intptr_t sizeBytes */,
llvmPointerType /* void stream /}};		llvmPointerType /* void stream /}};
FunctionCallBuilder deallocCallBuilder = {		FunctionCallBuilder deallocCallBuilder = {
"mgpuMemFree",		"mgpuMemFree",
llvmVoidType,		llvmVoidType,
{llvmPointerType /* void ptr /, llvmPointerType /* void stream /}};		{llvmPointerType /* void ptr /, llvmPointerType /* void stream /}};
		FunctionCallBuilder memcpyCallBuilder = {
		"mgpuMemcpy",
		llvmVoidType,
		{llvmPointerType /* void dst /, llvmPointerType /* void src /,
		llvmIntPtrType /* intptr_t sizeBytes */,
		llvmPointerType /* void stream /}};
};		};

/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime		/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).		/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern		class ConvertHostRegisterOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {		: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:		public:
ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)		ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
LogicalResult matchAndRewrite(gpu::GPUModuleOp op,		LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
// GPU kernel modules are no longer necessary since we have a global		// GPU kernel modules are no longer necessary since we have a global
// constant with the CUBIN, or HSACO data.		// constant with the CUBIN, or HSACO data.
rewriter.eraseOp(op);		rewriter.eraseOp(op);
return success();		return success();
}		}
};		};

		/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
		/// call. Currently it supports CUDA and ROCm (HIP).
		class ConvertMemcpyOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
		public:
		ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
		ConversionPatternRewriter &rewriter) const override;
		};
} // namespace		} // namespace

void GpuToLLVMConversionPass::runOnOperation() {		void GpuToLLVMConversionPass::runOnOperation() {
LLVMTypeConverter converter(&getContext());		LLVMTypeConverter converter(&getContext());
OwningRewritePatternList patterns;		OwningRewritePatternList patterns;
populateStdToLLVMConversionPatterns(converter, patterns);		populateStdToLLVMConversionPatterns(converter, patterns);
populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);		populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

▲ Show 20 Lines • Show All 359 Lines • ▼ Show 20 Lines	if (launchOp.asyncToken()) {
streamDestroyCallBuilder.create(loc, rewriter, stream);		streamDestroyCallBuilder.create(loc, rewriter, stream);
rewriter.eraseOp(launchOp);		rewriter.eraseOp(launchOp);
}		}
moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));		moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

return success();		return success();
}		}

		LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
		gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
		ConversionPatternRewriter &rewriter) const {
		auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

		if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) \|\|
		!isSupportedMemRefType(memRefType) \|\|
		failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
		return failure();

		auto loc = memcpyOp.getLoc();
		auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());

		MemRefDescriptor srcDesc(adaptor.src());

		Value numElements =
		memRefType.hasStaticShape()
		? createIndexConstant(rewriter, loc, memRefType.getNumElements())
		// For identity layouts (verified above), the number of elements is
		herhutUnsubmitted Not Done Reply Inline Actions This would only support vectors? Maybe use `getMemrefDescriptorSizes` from the LLVM lowering to compute the actual size? herhut: This would only support vectors? Maybe use `getMemrefDescriptorSizes` from the LLVM lowering to…
		csiggAuthorUnsubmitted Done Reply Inline Actions For identity layout (verified on line 672), stride[0]*size[0] gives the correct number of elements (stride[0] is 'product(size[1..n-1])'). `getMemrefDescriptorSizes` is not the right API here, you would first need to extract the dynamic sizes from the struct. csigg: For identity layout (verified on line 672), stride[0]*size[0] gives the correct number of…
		herhutUnsubmitted Not Done Reply Inline Actions Ah, this is subtle. Can you leave a comment so I understand this next time round, as well? Especially as the meaning of `isSupportedMemRefType` is not obvious here. herhut: Ah, this is subtle. Can you leave a comment so I understand this next time round, as well?
		csiggAuthorUnsubmitted Done Reply Inline Actions Added comment. I've been wanting to rename `isSupportedMemRefType`. Will do in a separate revision. csigg: Added comment. I've been wanting to rename `isSupportedMemRefType`. Will do in a separate…
		// stride[0] * size[0].
		: rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
		srcDesc.size(rewriter, loc, 0));

		Type elementPtrType = getElementPtrType(memRefType);
		Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
		Value gepPtr = rewriter.create<LLVM::GEPOp>(
		loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
		auto sizeBytes =
		rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

		auto src = rewriter.create<LLVM::BitcastOp>(
		loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
		auto dst = rewriter.create<LLVM::BitcastOp>(
		loc, llvmPointerType,
		MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

		auto stream = adaptor.asyncDependencies().front();
		memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

		rewriter.replaceOp(memcpyOp, {stream});

		return success();
		}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>		std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {		mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);		return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
}		}

void mlir::populateGpuToLLVMConversionPatterns(		void mlir::populateGpuToLLVMConversionPatterns(
LLVMTypeConverter &converter, OwningRewritePatternList &patterns,		LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
StringRef gpuBinaryAnnotation) {		StringRef gpuBinaryAnnotation) {
converter.addConversion(		converter.addConversion(
[context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {		[context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
return LLVM::LLVMType::getInt8PtrTy(context);		return LLVM::LLVMType::getInt8PtrTy(context);
});		});
patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,		patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,		ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,		ConvertHostRegisterOpToGpuRuntimeCallPattern,
		ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,		ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern>(converter);		ConvertWaitOpToGpuRuntimeCallPattern>(converter);
patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(		patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation);		converter, gpuBinaryAnnotation);
patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());		patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
}		}

mlir/test/Conversion/GPUCommon/lower-memcpy-to-gpu-runtime-calls.mlir

This file was added.

				// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm \| FileCheck %s

				module attributes {gpu.container_module} {

				// CHECK: func @foo
				func @foo(%dst : memref<7xf32, 1>, %src : memref<7xf32>) {
				// CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate
				%t0 = gpu.wait async
				// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint
				// CHECK: %[[src:.*]] = llvm.bitcast
				// CHECK: %[[dst:.*]] = llvm.bitcast
				// CHECK: llvm.call @mgpuMemcpy(%[[dst]], %[[src]], %[[size_bytes]], %[[t0]])
				%t1 = gpu.memcpy async [%t0] %dst, %src : memref<7xf32, 1>, memref<7xf32>
				// CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
				// CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
				gpu.wait [%t1]
				return
				}
				}

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

Show First 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	extern "C" void mgpuMemAlloc(uint64_t sizeBytes, CUstream /stream*/) {
CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));		CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
return reinterpret_cast<void *>(ptr);		return reinterpret_cast<void *>(ptr);
}		}

extern "C" void mgpuMemFree(void ptr, CUstream /stream*/) {		extern "C" void mgpuMemFree(void ptr, CUstream /stream*/) {
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));		CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
}		}

		extern "C" void mgpuMemcpy(void dst, void src, uint64_t sizeBytes,
		CUstream stream) {
		CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst),
		reinterpret_cast<CUdeviceptr>(src),
		sizeBytes, stream));
		}

/// Helper functions for writing mlir example code		/// Helper functions for writing mlir example code

// Allows to register byte array with the CUDA runtime. Helpful until we have		// Allows to register byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.		// transfer functions implemented.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {		extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /flags=/0));		CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /flags=/0));
}		}

Show All 23 Lines

mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp

Show First 20 Lines • Show All 112 Lines • ▼ Show 20 Lines	extern "C" void mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /stream*/) {
HIP_REPORT_IF_ERROR(hipMemAlloc(&ptr, sizeBytes));		HIP_REPORT_IF_ERROR(hipMemAlloc(&ptr, sizeBytes));
return ptr;		return ptr;
}		}

extern "C" void mgpuMemFree(void ptr, hipStream_t /stream*/) {		extern "C" void mgpuMemFree(void ptr, hipStream_t /stream*/) {
HIP_REPORT_IF_ERROR(hipMemFree(ptr));		HIP_REPORT_IF_ERROR(hipMemFree(ptr));
}		}

		extern "C" void mgpuMemcpy(void dst, void src, uint64_t sizeBytes,
		hipStream_t stream) {
		HIP_REPORT_IF_ERROR(hipMemcpyAsync(dst, src, sizeBytes, stream));
		}

/// Helper functions for writing mlir example code		/// Helper functions for writing mlir example code

// Allows to register byte array with the ROCM runtime. Helpful until we have		// Allows to register byte array with the ROCM runtime. Helpful until we have
// transfer functions implemented.		// transfer functions implemented.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {		extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
HIP_REPORT_IF_ERROR(hipHostRegister(ptr, sizeBytes, /flags=/0));		HIP_REPORT_IF_ERROR(hipHostRegister(ptr, sizeBytes, /flags=/0));
}		}

▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Lower gpu.memcpy to GPU runtime calls. (Closed, Public)

Details

Diff Detail