This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
openmp/libomptarget/plugins/cuda/src/
-
libomptarget/
-
plugins/
-
cuda/
-
src/
3
rtl.cpp

Differential D86038

[OpenMP][CUDA] Cache the maximal number of threads per block (per kernel)
ClosedPublic

Authored by jdoerfert on Aug 16 2020, 8:57 AM.

Download Raw Diff

Details

Reviewers

JonChesterfield
ye-luo
ABataev
grokos
tianshilei1992

Commits

rGaa27cfc1e7d7: [OpenMP][CUDA] Cache the maximal number of threads per block (per kernel)

Summary

Instead of calling cuFuncGetAttribute with
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK for every kernel invocation,
we can do it for the first one and cache the result as part of the
KernelInfo struct. The only functional change is that we now expect
cuFuncGetAttribute to succeed and otherwise propagate the error.
Ignoring any error seems like a slippery slope...

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jdoerfert created this revision.Aug 16 2020, 8:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 16 2020, 8:57 AM

Herald added subscribers: guansong, bollu, yaxunl. · View Herald Transcript

jdoerfert requested review of this revision.Aug 16 2020, 8:57 AM

Herald added a subscriber: sstefan1. · View Herald TranscriptAug 16 2020, 8:57 AM

Harbormaster completed remote builds in B68552: Diff 285889.Aug 16 2020, 9:23 AM

LGTM

openmp/libomptarget/plugins/cuda/src/rtl.cpp
892	tab

This revision is now accepted and ready to land.Aug 16 2020, 9:27 AM

tianshilei1992 added inline comments.Aug 16 2020, 9:59 AM

openmp/libomptarget/plugins/cuda/src/rtl.cpp
79	Do we need to change the front end? This data structure should be generated by the FE, right?

tianshilei1992 added inline comments.Aug 16 2020, 10:15 AM

openmp/libomptarget/plugins/cuda/src/rtl.cpp
79	Oh, my bad. It is not. The pointer is replaced during the initialization.

This revision was landed with ongoing or failed builds.Aug 16 2020, 12:40 PM

Closed by commit rGaa27cfc1e7d7: [OpenMP][CUDA] Cache the maximal number of threads per block (per kernel) (authored by jdoerfert). · Explain Why

This revision was automatically updated to reflect the committed changes.

jdoerfert added a commit: rGaa27cfc1e7d7: [OpenMP][CUDA] Cache the maximal number of threads per block (per kernel).

Revision Contents

Path

Size

openmp/

libomptarget/

plugins/

cuda/

src/

rtl.cpp

34 lines

Diff 285889

openmp/libomptarget/plugins/cuda/src/rtl.cpp

//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//		//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// RTL for CUDA machine		// RTL for CUDA machine
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include <cassert>		#include <cassert>
#include <cstddef>		#include <cstddef>
#include <cuda.h>		#include <cuda.h>
		Lint: Pre-merge checks Inline Actions clang-tidy: error: 'cuda.h' file not found [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: 'cuda.h' file not found [clang-diagnostic-error] [[https://github.
#include <list>		#include <list>
#include <memory>		#include <memory>
#include <mutex>		#include <mutex>
#include <string>		#include <string>
#include <vector>		#include <vector>

#include "omptargetplugin.h"		#include "omptargetplugin.h"

▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
struct KernelTy {		struct KernelTy {
CUfunction Func;		CUfunction Func;

// execution mode of kernel		// execution mode of kernel
// 0 - SPMD mode (without master warp)		// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)		// 1 - Generic mode (with master warp)
int8_t ExecutionMode;		int8_t ExecutionMode;

		/// Maximal number of threads per block for this kernel.
		int MaxThreadsPerBlock = 0;
		tianshilei1992Unsubmitted Not Done Reply Inline Actions Do we need to change the front end? This data structure should be generated by the FE, right? tianshilei1992: Do we need to change the front end? This data structure should be generated by the FE, right?
		tianshilei1992Unsubmitted Not Done Reply Inline Actions Oh, my bad. It is not. The pointer is replaced during the initialization. tianshilei1992: Oh, my bad. It is not. The pointer is replaced during the initialization.

KernelTy(CUfunction _Func, int8_t _ExecutionMode)		KernelTy(CUfunction _Func, int8_t _ExecutionMode)
: Func(_Func), ExecutionMode(_ExecutionMode) {}		: Func(_Func), ExecutionMode(_ExecutionMode) {}
};		};

/// Device environment data		/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header		/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.		/// file later.
struct omptarget_device_environmentTy {		struct omptarget_device_environmentTy {
▲ Show 20 Lines • Show All 752 Lines • ▼ Show 20 Lines	int dataDelete(const int DeviceId, void *TgtPtr) const {

Err = cuMemFree((CUdeviceptr)TgtPtr);		Err = cuMemFree((CUdeviceptr)TgtPtr);
if (!checkResult(Err, "Error returned from cuMemFree\n"))		if (!checkResult(Err, "Error returned from cuMemFree\n"))
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}

int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,		int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
void **TgtArgs, ptrdiff_t *TgtOffsets,		ptrdiff_t *TgtOffsets, const int ArgNum,
const int ArgNum, const int TeamNum,		const int TeamNum, const int ThreadLimit,
const int ThreadLimit,
const unsigned int LoopTripCount,		const unsigned int LoopTripCount,
__tgt_async_info *AsyncInfo) const {		__tgt_async_info *AsyncInfo) const {
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;

// All args are references.		// All args are references.
std::vector<void *> Args(ArgNum);		std::vector<void *> Args(ArgNum);
std::vector<void *> Ptrs(ArgNum);		std::vector<void *> Ptrs(ArgNum);

for (int I = 0; I < ArgNum; ++I) {		for (int I = 0; I < ArgNum; ++I) {
Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);		Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
Args[I] = &Ptrs[I];		Args[I] = &Ptrs[I];
}		}

const KernelTy *KernelInfo =		KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
reinterpret_cast<const KernelTy *>(TgtEntryPtr);

unsigned int CudaThreadsPerBlock;		int CudaThreadsPerBlock;
if (ThreadLimit > 0) {		if (ThreadLimit > 0) {
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);		DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
CudaThreadsPerBlock = ThreadLimit;		CudaThreadsPerBlock = ThreadLimit;
// Add master warp if necessary		// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {		if (KernelInfo->ExecutionMode == GENERIC) {
DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);		DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;		CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
}		}
} else {		} else {
DP("Setting CUDA threads per block to default %d\n",		DP("Setting CUDA threads per block to default %d\n",
DeviceData[DeviceId].NumThreads);		DeviceData[DeviceId].NumThreads);
CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;		CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
}		}

if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {		if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
DP("Threads per block capped at device limit %d\n",		DP("Threads per block capped at device limit %d\n",
DeviceData[DeviceId].ThreadsPerBlock);		DeviceData[DeviceId].ThreadsPerBlock);
CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;		CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
}		}

int KernelLimit;		if (!KernelInfo->MaxThreadsPerBlock) {
Err = cuFuncGetAttribute(&KernelLimit,		Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,		CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
		JonChesterfieldUnsubmitted Not Done Reply Inline Actions tab JonChesterfield: tab
KernelInfo->Func);		KernelInfo->Func);
if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {		if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
DP("Threads per block capped at kernel limit %d\n", KernelLimit);		return OFFLOAD_FAIL;
CudaThreadsPerBlock = KernelLimit;		}

		if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
		DP("Threads per block capped at kernel limit %d\n",
		KernelInfo->MaxThreadsPerBlock);
		CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
}		}

unsigned int CudaBlocksPerGrid;		unsigned int CudaBlocksPerGrid;
if (TeamNum <= 0) {		if (TeamNum <= 0) {
if (LoopTripCount > 0 && EnvNumTeams < 0) {		if (LoopTripCount > 0 && EnvNumTeams < 0) {
if (KernelInfo->ExecutionMode == SPMD) {		if (KernelInfo->ExecutionMode == SPMD) {
// We have a combined construct, i.e. `target teams distribute		// We have a combined construct, i.e. `target teams distribute
// parallel for [simd]`. We launch so many teams so that each thread		// parallel for [simd]`. We launch so many teams so that each thread
▲ Show 20 Lines • Show All 269 Lines • Show Last 20 Lines