This is an archive of the discontinued LLVM Phabricator instance.

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
856–857	Storing (most) state in a global variable that closes hsa in the destructor makes it difficult to use RAII classes to manage hsa resources. We could change to using multiple calls to hsa_init/hsa_shut_down, since it's internally reference counted, and that would mean the last one to shut down closes hsa. Better is probably to nest the lifetime - would like hsa_init to occur before any other objects are constructed and hsa_shut_down after they've all been torn down. Will think about how best to represent that (separately from this patch).

Herald added a reviewer: jdoerfert. · View Herald TranscriptSep 9 2021, 8:00 AM

Herald added a subscriber: sstefan1. · View Herald Transcript

D109512 rearranges rtl.cpp so that hsa is reliably constructed before other member variables in the big global object. If we land that, this signal pool can be refactored to free the signals in the destructor again. Still some care needed to ensure it doesn't try to create any without checking hsa is available, i.e. don't prepopulate the pool in the constructor, but at least cleanup can be implicit.

Likewise, we could put the queue in a unique_ptr or similar to reliably clean up.

Harbormaster completed remote builds in B123224: Diff 371599.Sep 9 2021, 8:34 AM

roll back most of the patch

put queues in unique_ptr

JonChesterfield retitled this revision from [libomptarget][amdgpu] Clean up destruction of hsa queue, signals to [libomptarget][amdgpu] Destruct HSA queues on exit.Sep 9 2021, 11:58 AM

JonChesterfield edited the summary of this revision. (Show Details)

JonChesterfield retitled this revision from [libomptarget][amdgpu] Destruct HSA queues on exit to [libomptarget][amdgpu] Destruct HSA queues.

Harbormaster completed remote builds in B123281: Diff 371680.Sep 9 2021, 1:42 PM

Looks good.

This revision is now accepted and ready to land.Sep 20 2021, 5:42 AM

This revision was landed with ongoing or failed builds.Sep 26 2021, 7:34 AM

Closed by commit rG8cf93a35d4b8: [libomptarget][amdgpu] Destruct HSA queues (authored by JonChesterfield). · Explain Why

This revision was automatically updated to reflect the committed changes.

JonChesterfield added a commit: rG8cf93a35d4b8: [libomptarget][amdgpu] Destruct HSA queues.

Revision Contents

Path

Size

openmp/

libomptarget/

plugins/

amdgpu/

dynamic_hsa/

hsa.h

2 lines

hsa.cpp

1 line

src/

rtl.cpp

36 lines

Diff 375104

openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h

	Show First 20 Lines • Show All 136 Lines • ▼ Show 20 Lines

	hsa_status_t hsa_queue_create(hsa_agent_t agent, uint32_t size,			hsa_status_t hsa_queue_create(hsa_agent_t agent, uint32_t size,
	hsa_queue_type32_t type,			hsa_queue_type32_t type,
	void (*callback)(hsa_status_t status,			void (*callback)(hsa_status_t status,
	hsa_queue_t *source, void *data),			hsa_queue_t *source, void *data),
	void *data, uint32_t private_segment_size,			void *data, uint32_t private_segment_size,
	uint32_t group_segment_size, hsa_queue_t **queue);			uint32_t group_segment_size, hsa_queue_t **queue);

				hsa_status_t hsa_queue_destroy(hsa_queue_t *queue);

	uint64_t hsa_queue_load_read_index_scacquire(const hsa_queue_t *queue);			uint64_t hsa_queue_load_read_index_scacquire(const hsa_queue_t *queue);

	uint64_t hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue,			uint64_t hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue,
	uint64_t value);			uint64_t value);

	typedef enum {			typedef enum {
	HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,			HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
	} hsa_packet_type_t;			} hsa_packet_type_t;
	▲ Show 20 Lines • Show All 118 Lines • Show Last 20 Lines

openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp

	Show All 23 Lines
	DLWRAP(hsa_agent_get_info, 3);			DLWRAP(hsa_agent_get_info, 3);
	DLWRAP(hsa_iterate_agents, 2);			DLWRAP(hsa_iterate_agents, 2);
	DLWRAP(hsa_signal_create, 4);			DLWRAP(hsa_signal_create, 4);
	DLWRAP(hsa_signal_destroy, 1);			DLWRAP(hsa_signal_destroy, 1);
	DLWRAP(hsa_signal_store_relaxed, 2);			DLWRAP(hsa_signal_store_relaxed, 2);
	DLWRAP(hsa_signal_store_screlease, 2);			DLWRAP(hsa_signal_store_screlease, 2);
	DLWRAP(hsa_signal_wait_scacquire, 5);			DLWRAP(hsa_signal_wait_scacquire, 5);
	DLWRAP(hsa_queue_create, 8);			DLWRAP(hsa_queue_create, 8);
				DLWRAP(hsa_queue_destroy, 1);
	DLWRAP(hsa_queue_load_read_index_scacquire, 1);			DLWRAP(hsa_queue_load_read_index_scacquire, 1);
	DLWRAP(hsa_queue_add_write_index_relaxed, 2);			DLWRAP(hsa_queue_add_write_index_relaxed, 2);
	DLWRAP(hsa_memory_copy, 3);			DLWRAP(hsa_memory_copy, 3);
	DLWRAP(hsa_executable_create, 4);			DLWRAP(hsa_executable_create, 4);
	DLWRAP(hsa_executable_destroy, 1);			DLWRAP(hsa_executable_destroy, 1);
	DLWRAP(hsa_executable_freeze, 2);			DLWRAP(hsa_executable_freeze, 2);
	DLWRAP(hsa_executable_symbol_get_info, 3);			DLWRAP(hsa_executable_symbol_get_info, 3);
	DLWRAP(hsa_executable_iterate_symbols, 3);			DLWRAP(hsa_executable_iterate_symbols, 3);
	▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Show First 20 Lines • Show All 445 Lines • ▼ Show 20 Lines	struct HSALifetime {
}		}
};		};

/// Class containing all the device information		/// Class containing all the device information
class RTLDeviceInfoTy {		class RTLDeviceInfoTy {
HSALifetime HSA; // First field => constructed first and destructed last		HSALifetime HSA; // First field => constructed first and destructed last
std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;		std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

		struct QueueDeleter {
		void operator()(hsa_queue_t *Q) {
		if (Q) {
		hsa_status_t Err = hsa_queue_destroy(Q);
		if (Err != HSA_STATUS_SUCCESS) {
		DP("Error destroying hsa queue: %s\n", get_error_string(Err));
		}
		}
		}
		};

public:		public:
// load binary populates symbol tables and mutates various global state		// load binary populates symbol tables and mutates various global state
// run uses those symbol tables		// run uses those symbol tables
std::shared_timed_mutex load_run_lock;		std::shared_timed_mutex load_run_lock;

int NumberOfDevices = 0;		int NumberOfDevices = 0;

// GPU devices		// GPU devices
std::vector<hsa_agent_t> HSAAgents;		std::vector<hsa_agent_t> HSAAgents;
std::vector<hsa_queue_t *> HSAQueues; // one per gpu		std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
		HSAQueues; // one per gpu

// CPUs		// CPUs
std::vector<hsa_agent_t> CPUAgents;		std::vector<hsa_agent_t> CPUAgents;

// Device properties		// Device properties
std::vector<int> ComputeUnits;		std::vector<int> ComputeUnits;
std::vector<int> GroupsPerDevice;		std::vector<int> GroupsPerDevice;
std::vector<int> ThreadsPerGroup;		std::vector<int> ThreadsPerGroup;
▲ Show 20 Lines • Show All 297 Lines • ▼ Show 20 Lines	RTLDeviceInfoTy() {

err = setupMemoryPools();		err = setupMemoryPools();
if (err != HSA_STATUS_SUCCESS) {		if (err != HSA_STATUS_SUCCESS) {
DP("Error when setting up memory pools");		DP("Error when setting up memory pools");
return;		return;
}		}

for (int i = 0; i < NumberOfDevices; i++) {		for (int i = 0; i < NumberOfDevices; i++) {
HSAQueues[i] = nullptr;
}

for (int i = 0; i < NumberOfDevices; i++) {
uint32_t queue_size = 0;		uint32_t queue_size = 0;
{		{
hsa_status_t err = hsa_agent_get_info(		hsa_status_t err = hsa_agent_get_info(
HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);		HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
if (err != HSA_STATUS_SUCCESS) {		if (err != HSA_STATUS_SUCCESS) {
DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i);		DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i);
return;		return;
}		}
enum { MaxQueueSize = 4096 };		enum { MaxQueueSize = 4096 };
if (queue_size > MaxQueueSize) {		if (queue_size > MaxQueueSize) {
queue_size = MaxQueueSize;		queue_size = MaxQueueSize;
}		}
}		}

hsa_status_t rc = hsa_queue_create(		{
HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL,		hsa_queue_t *Q = nullptr;
UINT32_MAX, UINT32_MAX, &HSAQueues[i]);		hsa_status_t rc =
		hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
		callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
if (rc != HSA_STATUS_SUCCESS) {		if (rc != HSA_STATUS_SUCCESS) {
DP("Failed to create HSA queue %d\n", i);		DP("Failed to create HSA queue %d\n", i);
return;		return;
}		}
		HSAQueues[i].reset(Q);
		}

deviceStateStore[i] = {nullptr, 0};		deviceStateStore[i] = {nullptr, 0};
}		}

for (int i = 0; i < NumberOfDevices; i++) {		for (int i = 0; i < NumberOfDevices; i++) {
ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;		ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;		GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
ComputeUnits[i] = 1;		ComputeUnits[i] = 1;
Show All 26 Lines	~RTLDeviceInfoTy() {

hsa_status_t Err;		hsa_status_t Err;
for (uint32_t I = 0; I < HSAExecutables.size(); I++) {		for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
Err = hsa_executable_destroy(HSAExecutables[I]);		Err = hsa_executable_destroy(HSAExecutables[I]);
if (Err != HSA_STATUS_SUCCESS) {		if (Err != HSA_STATUS_SUCCESS) {
DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,		DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
"Destroying executable", get_error_string(Err));		"Destroying executable", get_error_string(Err));
}		}
}		}
}		}
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Storing (most) state in a global variable that closes hsa in the destructor makes it difficult to use RAII classes to manage hsa resources. We could change to using multiple calls to hsa_init/hsa_shut_down, since it's internally reference counted, and that would mean the last one to shut down closes hsa. Better is probably to nest the lifetime - would like hsa_init to occur before any other objects are constructed and hsa_shut_down after they've all been torn down. Will think about how best to represent that (separately from this patch). JonChesterfield: Storing (most) state in a global variable that closes hsa in the destructor makes it difficult…
};		};

pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;		pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

// TODO: May need to drop the trailing to fields until deviceRTL is updated		// TODO: May need to drop the trailing to fields until deviceRTL is updated
struct omptarget_device_environmentTy {		struct omptarget_device_environmentTy {
int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG		int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
// only useful for Debug build of deviceRTLs		// only useful for Debug build of deviceRTLs
▲ Show 20 Lines • Show All 1,290 Lines • ▼ Show 20 Lines	fprintf(traceToStdout ? stdout : stderr,
device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,		device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,		arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,		group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
vgpr_spill_count, loop_tripcount, KernelInfo->Name);		vgpr_spill_count, loop_tripcount, KernelInfo->Name);
}		}

// Run on the device.		// Run on the device.
{		{
hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];		hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
if (!queue) {		if (!queue) {
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}
uint64_t packet_id = acquire_available_packet_id(queue);		uint64_t packet_id = acquire_available_packet_id(queue);

const uint32_t mask = queue->size - 1; // size is a power of 2		const uint32_t mask = queue->size - 1; // size is a power of 2
hsa_kernel_dispatch_packet_t *packet =		hsa_kernel_dispatch_packet_t *packet =
(hsa_kernel_dispatch_packet_t *)queue->base_address +		(hsa_kernel_dispatch_packet_t *)queue->base_address +
▲ Show 20 Lines • Show All 151 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget][amdgpu] Destruct HSA queuesClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 375104

openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h

openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

[libomptarget][amdgpu] Destruct HSA queues
ClosedPublic