Diff 316084

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

Show All 27 Lines	[](CUresult result) { \
const char *name = nullptr; \		const char *name = nullptr; \
cuGetErrorName(result, &name); \		cuGetErrorName(result, &name); \
if (!name) \		if (!name) \
name = "<unknown>"; \		name = "<unknown>"; \
llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \		llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \
}(expr)		}(expr)

// Static initialization of CUDA context for device ordinal 0.		// Static initialization of CUDA context for device ordinal 0.
static auto InitializeCtx = [] {		static auto Context = [] {
CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));		CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
CUdevice device;		CUdevice device;
CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/0));		CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/0));
CUcontext context;		CUcontext context;
CUDA_REPORT_IF_ERROR(cuCtxCreate(&context, /*flags=*/0, device));		CUDA_REPORT_IF_ERROR(cuCtxCreate(&context, /*flags=*/0, device));
return 0;		return context;
}();		}();

extern "C" CUmodule mgpuModuleLoad(void *data) {		extern "C" CUmodule mgpuModuleLoad(void *data) {
		herhut [Unsubmitted, Not Done]: Creating it always as before would make this less complex. What is the drawback?
		csigg (author) [Unsubmitted, Done]: Setting a specific context allows running on a different device, for example. The use is quite limited, though, because mgpuSetContext() is not thread-safe. We will probably need to expose the context per thread, or per function that needs one. I switched it to the primary context, which is the simplest.
		CUDA_REPORT_IF_ERROR(cuCtxSetCurrent(Context));
		herhut [Unsubmitted, Not Done]: Should this rather use push/pop in case there is some external (to the gpu dialect) use of the context, too? Like if this runs inside of some other runtime.
		csigg (author) [Unsubmitted, Done]: It certainly could, but it seems a little over-engineered at this stage. But happy to add it if you think it makes sense.
		herhut [Unsubmitted, Not Done]: CUDA context issues are annoying to debug, so why not, if we can avoid creating that issue? I will forget this and then be puzzled :)
CUmodule module = nullptr;		CUmodule module = nullptr;
CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));		CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
return module;		return module;
}		}

extern "C" void mgpuModuleUnload(CUmodule module) {		extern "C" void mgpuModuleUnload(CUmodule module) {
		herhut [Unsubmitted, Not Done]: This might no longer be the current one, if it was just created.
		csigg (author) [Unsubmitted, Done]: See comment below.
CUDA_REPORT_IF_ERROR(cuModuleUnload(module));		CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}		}

extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name) {		extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name) {
		herhut [Unsubmitted, Not Done]: Why not use `cuCtxPopCurrent` here?
		csigg (author) [Unsubmitted, Done]: The CUDA context stack is from early CUDA days. I have not seen anyone using it in years, and the HIP equivalent is marked deprecated.
CUfunction function = nullptr;		CUfunction function = nullptr;
CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));		CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
return function;		return function;
}		}

// The wrapper uses intptr_t instead of CUDA's unsigned int to match		// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the		// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.		// generated MLIR code.
extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX,		extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX,
intptr_t gridY, intptr_t gridZ,		intptr_t gridY, intptr_t gridZ,
intptr_t blockX, intptr_t blockY,		intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem, CUstream stream,		intptr_t blockZ, int32_t smem, CUstream stream,
void **params, void **extra) {		void **params, void **extra) {
		herhut [Unsubmitted, Not Done]: Doesn't `cuCtxCreate` already do this?
		csigg (author) [Unsubmitted, Done]: cuCtxCreate sets the current context; this restores it so that the c'tor can grab it. It's a bit of a back and forth, but there is no call_once-else.
		CUDA_REPORT_IF_ERROR(cuCtxSetCurrent(Context));
CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,		CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
blockY, blockZ, smem, stream, params,		blockY, blockZ, smem, stream, params,
extra));		extra));
}		}

extern "C" CUstream mgpuStreamCreate() {		extern "C" CUstream mgpuStreamCreate() {
		CUDA_REPORT_IF_ERROR(cuCtxSetCurrent(Context));
CUstream stream = nullptr;		CUstream stream = nullptr;
CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));		CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
return stream;		return stream;
}		}

extern "C" void mgpuStreamDestroy(CUstream stream) {		extern "C" void mgpuStreamDestroy(CUstream stream) {
CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream));		CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream));
}		}

extern "C" void mgpuStreamSynchronize(CUstream stream) {		extern "C" void mgpuStreamSynchronize(CUstream stream) {
CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));		CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));
}		}

extern "C" void mgpuStreamWaitEvent(CUstream stream, CUevent event) {		extern "C" void mgpuStreamWaitEvent(CUstream stream, CUevent event) {
CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0));		CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0));
}		}

extern "C" CUevent mgpuEventCreate() {		extern "C" CUevent mgpuEventCreate() {
		CUDA_REPORT_IF_ERROR(cuCtxSetCurrent(Context));
CUevent event = nullptr;		CUevent event = nullptr;
CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));		CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
return event;		return event;
}		}

extern "C" void mgpuEventDestroy(CUevent event) {		extern "C" void mgpuEventDestroy(CUevent event) {
CUDA_REPORT_IF_ERROR(cuEventDestroy(event));		CUDA_REPORT_IF_ERROR(cuEventDestroy(event));
}		}

extern "C" void mgpuEventSynchronize(CUevent event) {		extern "C" void mgpuEventSynchronize(CUevent event) {
CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));		CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));
}		}

extern "C" void mgpuEventRecord(CUevent event, CUstream stream) {		extern "C" void mgpuEventRecord(CUevent event, CUstream stream) {
CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));		CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}		}

extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {		extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
		CUDA_REPORT_IF_ERROR(cuCtxSetCurrent(Context));
CUdeviceptr ptr;		CUdeviceptr ptr;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));		CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
return reinterpret_cast<void *>(ptr);		return reinterpret_cast<void *>(ptr);
}		}

extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {		extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));		CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
}		}
Show All 39 Lines

mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp

Show All 26 Lines	if (!result) \
return; \		return; \
const char *name = hipGetErrorName(result); \		const char *name = hipGetErrorName(result); \
if (!name) \		if (!name) \
name = "<unknown>"; \		name = "<unknown>"; \
llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \		llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \
}(expr)		}(expr)

// Static initialization of HIP context for device ordinal 0.		// Static initialization of HIP context for device ordinal 0.
static auto InitializeCtx = [] {		static auto Context = [] {
HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0));		HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0));
hipDevice_t device;		hipDevice_t device;
HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0));		HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0));
hipContext_t context;		hipContext_t context;
HIP_REPORT_IF_ERROR(hipCtxCreate(&context, /*flags=*/0, device));		HIP_REPORT_IF_ERROR(hipCtxCreate(&context, /*flags=*/0, device));
return 0;		return context;
}();		}();

extern "C" hipModule_t mgpuModuleLoad(void *data) {		extern "C" hipModule_t mgpuModuleLoad(void *data) {
		HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context));
hipModule_t module = nullptr;		hipModule_t module = nullptr;
HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data));		HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data));
return module;		return module;
}		}
		csigg (author) [Unsubmitted, Done]: This should say `hipCtxGetCurrent`; will fix later.

		herhut [Unsubmitted, Not Done]: `context` -> `Context`
extern "C" void mgpuModuleUnload(hipModule_t module) {		extern "C" void mgpuModuleUnload(hipModule_t module) {
HIP_REPORT_IF_ERROR(hipModuleUnload(module));		HIP_REPORT_IF_ERROR(hipModuleUnload(module));
}		}

extern "C" hipFunction_t mgpuModuleGetFunction(hipModule_t module,		extern "C" hipFunction_t mgpuModuleGetFunction(hipModule_t module,
const char *name) {		const char *name) {
hipFunction_t function = nullptr;		hipFunction_t function = nullptr;
HIP_REPORT_IF_ERROR(hipModuleGetFunction(&function, module, name));		HIP_REPORT_IF_ERROR(hipModuleGetFunction(&function, module, name));
return function;		return function;
}		}

// The wrapper uses intptr_t instead of ROCM's unsigned int to match		// The wrapper uses intptr_t instead of ROCM's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the		// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.		// generated MLIR code.
extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,		extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
intptr_t gridY, intptr_t gridZ,		intptr_t gridY, intptr_t gridZ,
intptr_t blockX, intptr_t blockY,		intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem,		intptr_t blockZ, int32_t smem,
hipStream_t stream, void **params,		hipStream_t stream, void **params,
void **extra) {		void **extra) {
		HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context));
HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,		HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
blockX, blockY, blockZ, smem,		blockX, blockY, blockZ, smem,
stream, params, extra));		stream, params, extra));
}		}

extern "C" hipStream_t mgpuStreamCreate() {		extern "C" hipStream_t mgpuStreamCreate() {
		HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context));
hipStream_t stream = nullptr;		hipStream_t stream = nullptr;
HIP_REPORT_IF_ERROR(hipStreamCreate(&stream));		HIP_REPORT_IF_ERROR(hipStreamCreate(&stream));
return stream;		return stream;
}		}

extern "C" void mgpuStreamDestroy(hipStream_t stream) {		extern "C" void mgpuStreamDestroy(hipStream_t stream) {
HIP_REPORT_IF_ERROR(hipStreamDestroy(stream));		HIP_REPORT_IF_ERROR(hipStreamDestroy(stream));
}		}

extern "C" void mgpuStreamSynchronize(hipStream_t stream) {		extern "C" void mgpuStreamSynchronize(hipStream_t stream) {
return HIP_REPORT_IF_ERROR(hipStreamSynchronize(stream));		return HIP_REPORT_IF_ERROR(hipStreamSynchronize(stream));
}		}

extern "C" void mgpuStreamWaitEvent(hipStream_t stream, hipEvent_t event) {		extern "C" void mgpuStreamWaitEvent(hipStream_t stream, hipEvent_t event) {
HIP_REPORT_IF_ERROR(hipStreamWaitEvent(stream, event, /*flags=*/0));		HIP_REPORT_IF_ERROR(hipStreamWaitEvent(stream, event, /*flags=*/0));
}		}

extern "C" hipEvent_t mgpuEventCreate() {		extern "C" hipEvent_t mgpuEventCreate() {
		HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context));
hipEvent_t event = nullptr;		hipEvent_t event = nullptr;
HIP_REPORT_IF_ERROR(hipEventCreateWithFlags(&event, hipEventDisableTiming));		HIP_REPORT_IF_ERROR(hipEventCreateWithFlags(&event, hipEventDisableTiming));
return event;		return event;
}		}

extern "C" void mgpuEventDestroy(hipEvent_t event) {		extern "C" void mgpuEventDestroy(hipEvent_t event) {
HIP_REPORT_IF_ERROR(hipEventDestroy(event));		HIP_REPORT_IF_ERROR(hipEventDestroy(event));
}		}

extern "C" void mgpuEventSynchronize(hipEvent_t event) {		extern "C" void mgpuEventSynchronize(hipEvent_t event) {
HIP_REPORT_IF_ERROR(hipEventSynchronize(event));		HIP_REPORT_IF_ERROR(hipEventSynchronize(event));
}		}

extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {		extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));		HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
}		}

extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {		extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
		HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context));
void *ptr;		void *ptr;
HIP_REPORT_IF_ERROR(hipMemAlloc(&ptr, sizeBytes));		HIP_REPORT_IF_ERROR(hipMemAlloc(&ptr, sizeBytes));
return ptr;		return ptr;
}		}

extern "C" void mgpuMemFree(void *ptr, hipStream_t /*stream*/) {		extern "C" void mgpuMemFree(void *ptr, hipStream_t /*stream*/) {
HIP_REPORT_IF_ERROR(hipMemFree(ptr));		HIP_REPORT_IF_ERROR(hipMemFree(ptr));
}		}
▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Set CUDA/ROCm context before creating resources.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 316084

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Set CUDA/ROCm context before creating resources.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 316084

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp

[mlir] Set CUDA/ROCm context before creating resources.
ClosedPublic