This is an archive of the discontinued LLVM Phabricator instance.

streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
130	Hm. This makes a copy of "Code" in the map. Also, every time we do a lookup, we're going to have to compare the whole PTX strings, which are potentially very long. Is there no other identifier we could use as the map key?

jhen added inline comments.Sep 15 2016, 2:10 PM

streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp
130	Unfortunately, I don't think there is currently any other identifier that won't ever lead to false matches. Using the whole string as the key is what the original developers did because they couldn't find a better solution, so I'm mostly just following their lead here. There are some things we could do with randomly generated UUIDs that would work for all practical purposes, but I don't want to worry about correctly generating UUIDs. I also have the following idea, though it seems a little complex. Does it seem too complex to you? The idea: use a static integer with atomic increments (or a mutex, or whatever) to give a unique ID to each `MultiKernelLoaderSpec` instance in the process, and then have each instance assign a unique ID to each piece of code that is registered with it. The pair of `MultiKernelLoaderSpec` ID and code ID will uniquely identify a piece of code and can be used as a key in the module cache.

We've decided to come at this problem from a different angle, so I'm abandoning this revision.

Revision Contents

Path

Size

streamexecutor/

include/

streamexecutor/

platforms/

cuda/

CUDAPlatformDevice.h

9 lines

lib/

platforms/

cuda/

CUDAPlatformDevice.cpp

32 lines

Diff 71536

streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h

Show All 11 Lines
///		///
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H		#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H		#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H

#include "streamexecutor/PlatformDevice.h"		#include "streamexecutor/PlatformDevice.h"

		#include "llvm/Support/Mutex.h"

		#include <map>

		struct CUfunc_st;
		struct CUmod_st;

namespace streamexecutor {		namespace streamexecutor {
namespace cuda {		namespace cuda {

Error CUresultToError(int CUResult, const llvm::Twine &Message);		Error CUresultToError(int CUResult, const llvm::Twine &Message);

class CUDAPlatformDevice : public PlatformDevice {		class CUDAPlatformDevice : public PlatformDevice {
public:		public:
static Expected<CUDAPlatformDevice> create(size_t DeviceIndex);		static Expected<CUDAPlatformDevice> create(size_t DeviceIndex);
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	public:
Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset,		Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset,
const void *DeviceSrcHandle, size_t SrcByteOffset,		const void *DeviceSrcHandle, size_t SrcByteOffset,
size_t ByteCount) override;		size_t ByteCount) override;

private:		private:
CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {}		CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {}

int DeviceIndex;		int DeviceIndex;
		llvm::sys::Mutex Mutex;
		std::map<std::string, std::pair<CUmod_st *, CUfunc_st *>> LoadedModules;
};		};

} // namespace cuda		} // namespace cuda
} // namespace streamexecutor		} // namespace streamexecutor

#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H		#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H

streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp

Show First 20 Lines • Show All 84 Lines • ▼ Show 20 Lines
CUDAPlatformDevice::~CUDAPlatformDevice() {		CUDAPlatformDevice::~CUDAPlatformDevice() {
CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);		CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
(void)Result;		(void)Result;
// TODO(jhen): Log error.		// TODO(jhen): Log error.
}		}

Expected<const void *>		Expected<const void *>
CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {		CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
// TODO(jhen): Maybe first check loaded modules?
if (!Spec.hasCUDAPTXInMemory())		if (!Spec.hasCUDAPTXInMemory())
return make_error("no CUDA code available to create kernel");		return make_error("no CUDA code available to create kernel");

CUdevice Device = static_cast<int>(DeviceIndex);		CUdevice Device = static_cast<int>(DeviceIndex);
int ComputeCapabilityMajor = 0;		int ComputeCapabilityMajor = 0;
int ComputeCapabilityMinor = 0;		int ComputeCapabilityMinor = 0;
if (CUresult Result = cuDeviceGetAttribute(		if (CUresult Result = cuDeviceGetAttribute(
&ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,		&ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
Show All 10 Lines	CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,		const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
ComputeCapabilityMinor);		ComputeCapabilityMinor);

if (!Code)		if (!Code)
return make_error("no suitable CUDA source found for compute capability " +		return make_error("no suitable CUDA source found for compute capability " +
llvm::Twine(ComputeCapabilityMajor) + "." +		llvm::Twine(ComputeCapabilityMajor) + "." +
llvm::Twine(ComputeCapabilityMinor));		llvm::Twine(ComputeCapabilityMinor));

CUmodule Module;		CUfunction Function = nullptr;
		{
		llvm::sys::ScopedLock Lock(Mutex);
		auto Iterator = LoadedModules.find(Code);
		if (Iterator == LoadedModules.end()) {
		CUmodule Module = nullptr;
if (CUresult Result = cuModuleLoadData(&Module, Code))		if (CUresult Result = cuModuleLoadData(&Module, Code))
return CUresultToError(Result, "cuModuleLoadData");		return CUresultToError(Result, "cuModuleLoadData");
		if (CUresult Result = cuModuleGetFunction(&Function, Module,
CUfunction Function;		Spec.getKernelName().c_str()))
if (CUresult Result =
cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
return CUresultToError(Result, "cuModuleGetFunction");		return CUresultToError(Result, "cuModuleGetFunction");
		LoadedModules.emplace(Code, std::make_pair(Module, Function));
		jlebar (inline comment; unsubmitted, not done): Hm. This makes a copy of "Code" in the map. Also, every time we do a lookup, we're going to have to compare the whole PTX strings, which are potentially very long. Is there no other identifier we could use as the map key?
		jhen (author; inline comment; unsubmitted, not done): Unfortunately, I don't think there is currently any other identifier that won't ever lead to false matches. Using the whole string as the key is what the original developers did because they couldn't find a better solution, so I'm mostly just following their lead here. There are some things we could do with randomly generated UUIDs that would work for all practical purposes, but I don't want to worry about correctly generating UUIDs. I also have the following idea, though it seems a little complex. Does it seem too complex to you? The idea: use a static integer with atomic increments (or a mutex, or whatever) to give a unique ID to each `MultiKernelLoaderSpec` instance in the process, and then have each instance assign a unique ID to each piece of code that is registered with it. The pair of `MultiKernelLoaderSpec` ID and code ID will uniquely identify a piece of code and can be used as a key in the module cache.
// TODO(jhen): Should I save this function pointer in case someone asks for		} else
// it again?		Function = Iterator->second.second;
		}
// TODO(jhen): Should I save the module pointer so I can unload it when I
// destroy this device?

return static_cast<const void *>(Function);		return static_cast<const void *>(Function);
}		}

Error CUDAPlatformDevice::destroyKernel(const void *Handle) {		Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
// TODO(jhen): Maybe keep track of kernels for each module and unload the
// module after they are all destroyed.
return Error::success();		return Error::success();
}		}

Expected<const void *> CUDAPlatformDevice::createStream() {		Expected<const void *> CUDAPlatformDevice::createStream() {
CUstream Stream;		CUstream Stream;
if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT))		if (CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT))
return CUresultToError(Result, "cuStreamCreate");		return CUresultToError(Result, "cuStreamCreate");
return Stream;		return Stream;
▲ Show 20 Lines • Show All 159 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SE] Cache CUDA modules (Abandoned, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 71536

streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h

streamexecutor/lib/platforms/cuda/CUDAPlatformDevice.cpp

[SE] Cache CUDA modules
AbandonedPublic