This is an archive of the discontinued LLVM Phabricator instance.

[mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute
ClosedPublic

Authored by guraypp on Aug 2 2023, 3:57 AM.

Download Raw Diff

Details

Reviewers

aartbik
qcolombet
nicolasvasilache

Commits

rG53881490c2ed: [mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute

Summary

This work aims to address an issue with larger shared memory usage in the MLIR CUDA runtime. Currently, when the shared memory usage exceeds 48KB, the CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES attribute of the CUDA kernel needs to be set appropriately. This change takes care of that by setting the attribute as required. Additionally, it includes some debug prints for better visibility and troubleshooting.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

guraypp created this revision.Aug 2 2023, 3:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 2 2023, 3:57 AM

Herald added subscribers: bviyer, Moerafaat, zero9178 and 23 others. · View Herald Transcript

guraypp requested review of this revision.Aug 2 2023, 3:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 2 2023, 3:57 AM

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

guraypp added reviewers: aartbik, qcolombet, nicolasvasilache.Aug 2 2023, 3:58 AM

Harbormaster completed remote builds in B249722: Diff 546394.Aug 2 2023, 4:45 AM

nicolasvasilache accepted this revision.Aug 2 2023, 4:58 AM

This revision is now accepted and ready to land.Aug 2 2023, 4:58 AM

Closed by commit rG53881490c2ed: [mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute (authored by guraypp). · Explain WhyAug 2 2023, 5:19 AM

This revision was automatically updated to reflect the committed changes.

guraypp added a commit: rG53881490c2ed: [mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute.

Revision Contents

Path

Size

mlir/

lib/

ExecutionEngine/

CudaRuntimeWrappers.cpp

30 lines

Diff 546420

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

	Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines

	#define debug_print(fmt, ...) \			#define debug_print(fmt, ...) \
	do { \			do { \
	if (isDebugEnabled()) \			if (isDebugEnabled()) \
	fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \			fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \
	__func__, __VA_ARGS__); \			__func__, __VA_ARGS__); \
	} while (0)			} while (0)

				// Returns default CUdevice
				CUdevice getDefaultCuDevice() {
				CUdevice device;
				CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
				return device;
				}

	// Make the primary context of the current default device current for the			// Make the primary context of the current default device current for the
	// duration			// duration
	// of the instance and restore the previous context on destruction.			// of the instance and restore the previous context on destruction.
	class ScopedContext {			class ScopedContext {
	public:			public:
	ScopedContext() {			ScopedContext() {
	// Static reference to CUDA primary context for device ordinal			// Static reference to CUDA primary context for device ordinal
	// defaultDevice.			// defaultDevice.
	static CUcontext context = [] {			static CUcontext context = [] {
	CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));			CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
	CUdevice device;
	CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
	CUcontext ctx;			CUcontext ctx;
	// Note: this does not affect the current context.			// Note: this does not affect the current context.
	CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));			CUDA_REPORT_IF_ERROR(
				cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
	return ctx;			return ctx;
	}();			}();

	CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));			CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
	}			}

	~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }			~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
	};			};
	Show All 36 Lines
	// the type of MLIR's index type. This avoids the need for casts in the			// the type of MLIR's index type. This avoids the need for casts in the
	// generated MLIR code.			// generated MLIR code.
	extern "C" MLIR_CUDA_WRAPPERS_EXPORT void			extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
	mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,			mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
	intptr_t gridZ, intptr_t blockX, intptr_t blockY,			intptr_t gridZ, intptr_t blockX, intptr_t blockY,
	intptr_t blockZ, int32_t smem, CUstream stream, void **params,			intptr_t blockZ, int32_t smem, CUstream stream, void **params,
	void **extra) {			void **extra) {
	ScopedContext scopedContext;			ScopedContext scopedContext;
				int32_t maxShmem = 0;
				CUdevice device = getDefaultCuDevice();
				CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
				CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
				&maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
				device));
				if (maxShmem < smem) {
				fprintf(stderr,
				"Requested shared memory (%dkb) is larger than maximum allowed "
				"shared memory (%dkb) for this device\n",
				smem, maxShmem);
				}
				CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
				function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
				debug_print("Launching kernel, grid=%ld,%ld,%ld, "
				"threads: %ld, %ld, %ld, "
				"smem: %dkb\n",
				gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
	CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,			CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
	blockY, blockZ, smem, stream, params,			blockY, blockZ, smem, stream, params,
	extra));			extra));
	}			}

	extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {			extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
	ScopedContext scopedContext;			ScopedContext scopedContext;
	CUstream stream = nullptr;			CUstream stream = nullptr;
	▲ Show 20 Lines • Show All 616 Lines • Show Last 20 Lines