This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Increase opportunity for parallel kernel launch in AMDGPUs: add multiple hsa queue's per device in plugin
ClosedPublic

Authored by carlo.bertolli on Dec 14 2021, 5:47 PM.

Download Raw Diff

Details

Reviewers

JonChesterfield
gregrodgers
ronl
grokos
dpalermo
jdoerfert

Commits

rGd83dc4c64814: [OpenMP] Increase opportunity for parallel kernel launch in AMDGPUs: add…

Summary

This patch extends the AMDGPU plugin for OpenMP target offloading from using a single HSA queue to using multiple queues (four in this patch) per device. This allows multiple host threads to submit kernel launches to the same GPU concurrently.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

carlo.bertolli created this revision.Dec 14 2021, 5:47 PM

Herald added subscribers: kerbowa, guansong, t-tye and 6 others. · View Herald TranscriptDec 14 2021, 5:47 PM

carlo.bertolli requested review of this revision.Dec 14 2021, 5:47 PM

Herald added a reviewer: jdoerfert. · View Herald TranscriptDec 14 2021, 5:47 PM

Herald added subscribers: sstefan1, wdng. · View Herald Transcript

Harbormaster completed remote builds in B139336: Diff 394425.Dec 14 2021, 5:55 PM

LG, thanks!

This revision is now accepted and ready to land.Dec 15 2021, 12:35 AM

Closed by commit rGd83dc4c64814: [OpenMP] Increase opportunity for parallel kernel launch in AMDGPUs: add… (authored by carlo.bertolli). · Explain WhyDec 15 2021, 7:35 AM

This revision was automatically updated to reflect the committed changes.

carlo.bertolli added a commit: rGd83dc4c64814: [OpenMP] Increase opportunity for parallel kernel launch in AMDGPUs: add….

Herald added a project: Restricted Project. · View Herald TranscriptDec 15 2021, 7:35 AM

Herald added a subscriber: openmp-commits. · View Herald Transcript

Revision Contents

Path

Size

openmp/

libomptarget/

plugins/

amdgpu/

src/

rtl.cpp

82 lines

Diff 394567

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Show First 20 Lines • Show All 343 Lines • ▼ Show 20 Lines
}		}

struct HSALifetime {		struct HSALifetime {
// Wrapper around HSA used to ensure it is constructed before other types		// Wrapper around HSA used to ensure it is constructed before other types
// and destructed after, which means said other types can use raii for		// and destructed after, which means said other types can use raii for
// cleanup without risking running outside of the lifetime of HSA		// cleanup without risking running outside of the lifetime of HSA
const hsa_status_t S;		const hsa_status_t S;

bool success() { return S == HSA_STATUS_SUCCESS; }		bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
HSALifetime() : S(hsa_init()) {}		HSALifetime() : S(hsa_init()) {}

~HSALifetime() {		~HSALifetime() {
if (S == HSA_STATUS_SUCCESS) {		if (S == HSA_STATUS_SUCCESS) {
hsa_status_t Err = hsa_shut_down();		hsa_status_t Err = hsa_shut_down();
if (Err != HSA_STATUS_SUCCESS) {		if (Err != HSA_STATUS_SUCCESS) {
// Can't call into HSA to get a string from the integer		// Can't call into HSA to get a string from the integer
DP("Shutting down HSA failed: %d\n", Err);		DP("Shutting down HSA failed: %d\n", Err);
}		}
}		}
}		}
};		};

		// Handle scheduling of multiple hsa_queue's per device to
		// multiple threads (one scheduler per device)
		class HSAQueueScheduler {
		public:
		HSAQueueScheduler() : current(0) {}

		HSAQueueScheduler(const HSAQueueScheduler &) = delete;

		HSAQueueScheduler(HSAQueueScheduler &&q) {
		current = q.current.load();
		for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
		HSAQueues[i] = q.HSAQueues[i];
		q.HSAQueues[i] = nullptr;
		}
		}

		// \return false if any HSA queue creation fails
		bool CreateQueues(hsa_agent_t HSAAgent, uint32_t queue_size) {
		for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
		hsa_queue_t *Q = nullptr;
		hsa_status_t rc =
		hsa_queue_create(HSAAgent, queue_size, HSA_QUEUE_TYPE_MULTI,
		callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
		if (rc != HSA_STATUS_SUCCESS) {
		DP("Failed to create HSA queue %d\n", i);
		return false;
		}
		HSAQueues[i] = Q;
		}
		return true;
		}

		~HSAQueueScheduler() {
		for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
		if (HSAQueues[i]) {
		hsa_status_t err = hsa_queue_destroy(HSAQueues[i]);
		if (err != HSA_STATUS_SUCCESS)
		DP("Error destroying HSA queue");
		}
		}
		}

		// \return next queue to use for device
		hsa_queue_t *Next() {
		return HSAQueues[(current.fetch_add(1, std::memory_order_relaxed)) %
		NUM_QUEUES_PER_DEVICE];
		}

		private:
		// Number of queues per device
		enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
		hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
		std::atomic<uint8_t> current;
		};

/// Class containing all the device information		/// Class containing all the device information
class RTLDeviceInfoTy {		class RTLDeviceInfoTy : HSALifetime {
HSALifetime HSA; // First field => constructed first and destructed last
std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;		std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

struct QueueDeleter {		struct QueueDeleter {
void operator()(hsa_queue_t *Q) {		void operator()(hsa_queue_t *Q) {
if (Q) {		if (Q) {
hsa_status_t Err = hsa_queue_destroy(Q);		hsa_status_t Err = hsa_queue_destroy(Q);
if (Err != HSA_STATUS_SUCCESS) {		if (Err != HSA_STATUS_SUCCESS) {
DP("Error destroying hsa queue: %s\n", get_error_string(Err));		DP("Error destroying hsa queue: %s\n", get_error_string(Err));
}		}
}		}
}		}
};		};

public:		public:
bool ConstructionSucceeded = false;		bool ConstructionSucceeded = false;

// load binary populates symbol tables and mutates various global state		// load binary populates symbol tables and mutates various global state
// run uses those symbol tables		// run uses those symbol tables
std::shared_timed_mutex load_run_lock;		std::shared_timed_mutex load_run_lock;

int NumberOfDevices = 0;		int NumberOfDevices = 0;

// GPU devices		// GPU devices
std::vector<hsa_agent_t> HSAAgents;		std::vector<hsa_agent_t> HSAAgents;
std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>		std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
HSAQueues; // one per gpu

// CPUs		// CPUs
std::vector<hsa_agent_t> CPUAgents;		std::vector<hsa_agent_t> CPUAgents;

// Device properties		// Device properties
std::vector<int> ComputeUnits;		std::vector<int> ComputeUnits;
std::vector<int> GroupsPerDevice;		std::vector<int> GroupsPerDevice;
std::vector<int> ThreadsPerGroup;		std::vector<int> ThreadsPerGroup;
▲ Show 20 Lines • Show All 250 Lines • ▼ Show 20 Lines	RTLDeviceInfoTy() {
DP("Start initializing " GETNAME(TARGET_NAME) "\n");		DP("Start initializing " GETNAME(TARGET_NAME) "\n");

// LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr		// LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
// anytime. You do not need a debug library build.		// anytime. You do not need a debug library build.
// 0 => no tracing		// 0 => no tracing
// 1 => tracing dispatch only		// 1 => tracing dispatch only
// >1 => verbosity increase		// >1 => verbosity increase

if (!HSA.success()) {		if (!HSAInitSuccess()) {
DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");		DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
return;		return;
}		}

if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))		if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
print_kernel_trace = atoi(envStr);		print_kernel_trace = atoi(envStr);
else		else
print_kernel_trace = 0;		print_kernel_trace = 0;
Show All 22 Lines	RTLDeviceInfoTy() {
if (NumberOfDevices == 0) {		if (NumberOfDevices == 0) {
DP("There are no devices supporting HSA.\n");		DP("There are no devices supporting HSA.\n");
return;		return;
} else {		} else {
DP("There are %d devices supporting HSA.\n", NumberOfDevices);		DP("There are %d devices supporting HSA.\n", NumberOfDevices);
}		}

// Init the device info		// Init the device info
HSAQueues.resize(NumberOfDevices);		HSAQueueSchedulers.reserve(NumberOfDevices);
FuncGblEntries.resize(NumberOfDevices);		FuncGblEntries.resize(NumberOfDevices);
ThreadsPerGroup.resize(NumberOfDevices);		ThreadsPerGroup.resize(NumberOfDevices);
ComputeUnits.resize(NumberOfDevices);		ComputeUnits.resize(NumberOfDevices);
GPUName.resize(NumberOfDevices);		GPUName.resize(NumberOfDevices);
GroupsPerDevice.resize(NumberOfDevices);		GroupsPerDevice.resize(NumberOfDevices);
WarpSize.resize(NumberOfDevices);		WarpSize.resize(NumberOfDevices);
NumTeams.resize(NumberOfDevices);		NumTeams.resize(NumberOfDevices);
NumThreads.resize(NumberOfDevices);		NumThreads.resize(NumberOfDevices);
Show All 26 Lines	for (int i = 0; i < NumberOfDevices; i++) {
}		}
enum { MaxQueueSize = 4096 };		enum { MaxQueueSize = 4096 };
if (queue_size > MaxQueueSize) {		if (queue_size > MaxQueueSize) {
queue_size = MaxQueueSize;		queue_size = MaxQueueSize;
}		}
}		}

{		{
hsa_queue_t *Q = nullptr;		HSAQueueScheduler QSched;
hsa_status_t rc =		if (!QSched.CreateQueues(HSAAgents[i], queue_size))
hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
if (rc != HSA_STATUS_SUCCESS) {
DP("Failed to create HSA queue %d\n", i);
return;		return;
}		HSAQueueSchedulers.emplace_back(std::move(QSched));
HSAQueues[i].reset(Q);
}		}

deviceStateStore[i] = {nullptr, 0};		deviceStateStore[i] = {nullptr, 0};
}		}

for (int i = 0; i < NumberOfDevices; i++) {		for (int i = 0; i < NumberOfDevices; i++) {
ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;		ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;		GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
Show All 11 Lines	RTLDeviceInfoTy() {
// Default state.		// Default state.
RequiresFlags = OMP_REQ_UNDEFINED;		RequiresFlags = OMP_REQ_UNDEFINED;

ConstructionSucceeded = true;		ConstructionSucceeded = true;
}		}

~RTLDeviceInfoTy() {		~RTLDeviceInfoTy() {
DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");		DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
if (!HSA.success()) {		if (!HSAInitSuccess()) {
// Then none of these can have been set up and they can't be torn down		// Then none of these can have been set up and they can't be torn down
return;		return;
}		}
// Run destructors on types that use HSA before		// Run destructors on types that use HSA before
// impl_finalize removes access to it		// impl_finalize removes access to it
deviceStateStore.clear();		deviceStateStore.clear();
KernelArgPoolMap.clear();		KernelArgPoolMap.clear();
// Terminate hostrpc before finalizing hsa		// Terminate hostrpc before finalizing hsa
▲ Show 20 Lines • Show All 320 Lines • ▼ Show 20 Lines	fprintf(traceToStdout ? stdout : stderr,
device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,		device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,		arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,		group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
vgpr_spill_count, loop_tripcount, KernelInfo->Name);		vgpr_spill_count, loop_tripcount, KernelInfo->Name);
}		}

// Run on the device.		// Run on the device.
{		{
hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();		hsa_queue_t *queue = DeviceInfo.HSAQueueSchedulers[device_id].Next();
if (!queue) {		if (!queue) {
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}
uint64_t packet_id = acquire_available_packet_id(queue);		uint64_t packet_id = acquire_available_packet_id(queue);

const uint32_t mask = queue->size - 1; // size is a power of 2		const uint32_t mask = queue->size - 1; // size is a power of 2
hsa_kernel_dispatch_packet_t *packet =		hsa_kernel_dispatch_packet_t *packet =
(hsa_kernel_dispatch_packet_t *)queue->base_address +		(hsa_kernel_dispatch_packet_t *)queue->base_address +
▲ Show 20 Lines • Show All 1,166 Lines • Show Last 20 Lines