This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
openmp/trunk/libomptarget/plugins/cuda/src/
-
trunk/
-
libomptarget/
-
plugins/
-
cuda/
-
src/
-
rtl.cpp

Differential D32321

[OpenMP] Optimized default kernel launch parameters in CUDA plugin
ClosedPublic

Authored by grokos on Apr 20 2017, 4:15 PM.

Download Raw Diff

Details

Reviewers

arpith-jacob
jlpeyton
Hahnfeld
jhen

Commits

rGc13df8e5e0ca: [OpenMP] Optimized default kernel launch parameters in CUDA plugin
rOMP301321: [OpenMP] Optimized default kernel launch parameters in CUDA plugin
rL301321: [OpenMP] Optimized default kernel launch parameters in CUDA plugin

Summary

This patch modifies the default target kernel launch parameters (num_teams and thread_limit). The default thread_limit is set to 128 threads per team. In SPMD mode the kernel is launched with 128 threads, in non-SPMD mode we use 96 threads (+32 of the master warp).

The default number of teams has been optimized as follows. For the constructs below:

#target teams distribute
#teams distribute
#target teams distribute simd
#teams distribute simd

if the associated loop trip count is N, then the kernel is launched with N teams.

Diff Detail

Repository: rL LLVM

Event Timeline

grokos created this revision. · Apr 20 2017, 4:15 PM

Herald added a subscriber: rengolin. · View Herald Transcript · Apr 20 2017, 4:15 PM

Does this change result in a lower runtime? Last time I tested clang-ykt on Pascal GPUs, 1024 threads were really the best thing to do...

libomptarget/plugins/cuda/src/rtl.cpp
594–598 ↗	(On Diff #96044)	Just move this code under `if (thread_limit > 0)`?
622–624 ↗	(On Diff #96044)	So each block executes one iteration? What is left for the threads in each block?

Hi Jonas,

The numbers are based on my testing of the Rodinia benchmark on k40m.

We don't have a working compiler on Pascal as yet (many of the omptests fail on Pascal) so I have not benchmarked on that GPU. Our compiler exposes a bug in the CUDA toolkit that is being fixed. It is possible that 1024 threads perform better on Pascal (these are of course heuristics) so we should extend the functionality here once Pascal support is added to the compiler/runtime.

libomptarget/plugins/cuda/src/rtl.cpp
622–624 ↗	(On Diff #96044)	Correct. This case is for the 'teams distribute' construct. The assumption is that there is a nested parallel construct in which the threads within the block participate. Example: #pragma omp target teams distribute for(...) { #pragma omp parallel for reduction(..) for(..) {} }

Wrote inline comments to make clear what the new default launch configuration is about.

LGTM

This revision is now accepted and ready to land. · Apr 25 2017, 9:41 AM

Closed by commit rL301321: [OpenMP] Optimized default kernel launch parameters in CUDA plugin (authored by grokos). · Explain Why · Apr 25 2017, 9:47 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

openmp/

trunk/

libomptarget/

plugins/

cuda/

src/

rtl.cpp

41 lines

Diff 96584

openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines

/// Keep entries table per device.		/// Keep entries table per device.
struct FuncOrGblEntryTy {		struct FuncOrGblEntryTy {
__tgt_target_table Table;		__tgt_target_table Table;
std::vector<__tgt_offload_entry> Entries;		std::vector<__tgt_offload_entry> Entries;
};		};

enum ExecutionModeType {		enum ExecutionModeType {
SPMD,		SPMD, // constructors, destructors,
GENERIC,		// combined constructs (`teams distribute parallel for [simd]`)
		GENERIC, // everything else
NONE		NONE
};		};

/// Use a single entity to encode a kernel and a set of flags		/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {		struct KernelTy {
CUfunction Func;		CUfunction Func;

// execution mode of kernel		// execution mode of kernel
Show All 30 Lines	public:
// OpenMP Environment properties		// OpenMP Environment properties
int EnvNumTeams;		int EnvNumTeams;
int EnvTeamLimit;		int EnvTeamLimit;

//static int EnvNumThreads;		//static int EnvNumThreads;
static const int HardTeamLimit = 1<<16; // 64k		static const int HardTeamLimit = 1<<16; // 64k
static const int HardThreadLimit = 1024;		static const int HardThreadLimit = 1024;
static const int DefaultNumTeams = 128;		static const int DefaultNumTeams = 128;
static const int DefaultNumThreads = 1024;		static const int DefaultNumThreads = 128;

// Record entry point associated with device		// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {		void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&		assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");		"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id];		FuncOrGblEntryTy &E = FuncGblEntries[device_id];

E.Entries.push_back(entry);		E.Entries.push_back(entry);
▲ Show 20 Lines • Show All 465 Lines • ▼ Show 20 Lines	int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,

KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;		KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

int cudaThreadsPerBlock;		int cudaThreadsPerBlock;

if (thread_limit > 0) {		if (thread_limit > 0) {
cudaThreadsPerBlock = thread_limit;		cudaThreadsPerBlock = thread_limit;
DP("Setting CUDA threads per block to requested %d\n", thread_limit);		DP("Setting CUDA threads per block to requested %d\n", thread_limit);
} else {
cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
DP("Setting CUDA threads per block to default %d\n",
DeviceInfo.NumThreads[device_id]);
}

// Add master warp if necessary		// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {		if (KernelInfo->ExecutionMode == GENERIC) {
cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];		cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);		DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
}		}
		} else {
		cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
		DP("Setting CUDA threads per block to default %d\n",
		DeviceInfo.NumThreads[device_id]);
		}

if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {		if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];		cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
DP("Threads per block capped at device limit %d\n",		DP("Threads per block capped at device limit %d\n",
DeviceInfo.ThreadsPerBlock[device_id]);		DeviceInfo.ThreadsPerBlock[device_id]);
}		}

int kernel_limit;		int kernel_limit;
err = cuFuncGetAttribute(&kernel_limit,		err = cuFuncGetAttribute(&kernel_limit,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);		CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
if (err == CUDA_SUCCESS) {		if (err == CUDA_SUCCESS) {
if (kernel_limit < cudaThreadsPerBlock) {		if (kernel_limit < cudaThreadsPerBlock) {
cudaThreadsPerBlock = kernel_limit;		cudaThreadsPerBlock = kernel_limit;
DP("Threads per block capped at kernel limit %d\n", kernel_limit);		DP("Threads per block capped at kernel limit %d\n", kernel_limit);
}		}
}		}

int cudaBlocksPerGrid;		int cudaBlocksPerGrid;
if (team_num <= 0) {		if (team_num <= 0) {
if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {		if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
		if (KernelInfo->ExecutionMode == SPMD) {
		// We have a combined construct, i.e. `target teams distribute parallel
		// for [simd]`. We launch so many teams so that each thread will
		// execute one iteration of the loop.
// round up to the nearest integer		// round up to the nearest integer
cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;		cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
		} else {
		// If we reach this point, then we have a non-combined construct, i.e.
		// `teams distribute` with a nested `parallel for` and each team is
		// assigned one iteration of the `distribute` loop. E.g.:
		//
		// #pragma omp target teams distribute
		// for(...loop_tripcount...) {
		// #pragma omp parallel for
		// for(...) {}
		// }
		//
		// Threads within a team will execute the iterations of the `parallel`
		// loop.
		cudaBlocksPerGrid = loop_tripcount;
		}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "		DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,		"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
cudaThreadsPerBlock);		cudaThreadsPerBlock);
} else {		} else {
cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];		cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);		DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
}		}
} else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {		} else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines