Diff 277889

clang/test/OpenMP/remarks_parallel_in_target_state_machine.c

This file was added.

				// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
				// RUN: %clang_cc1 -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
				// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out

				// host-no-diagnostics

				void bar(void) {
				#pragma omp parallel // #1
				Lint: Pre-merge checks Inline Actions clang-format: please reformat the code -#pragma omp parallel // #1 - // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}} +#pragma omp parallel // #1 \ + // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}} \ Lint: Pre-merge checks: clang-format: please reformat the code ``` -#pragma omp parallel // #1 - //…
				// expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}}
				// expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
				{
				}
				}
				ye-luo (Unsubmitted, Not Done; Reply; Inline Actions): Add a space: change "machine__omp_outlined__2_wrapper" to "machine __omp_outlined__2_wrapper". ye-luo: Add a space "machine__omp_outlined__2_wrapper" to "machine __omp_outlined__2_wrapper"

				void foo(void) {
				#pragma omp target teams // #2
				Lint: Pre-merge checks Inline Actions clang-format: please reformat the code -#pragma omp target teams // #2 - // expected-remark@#2 {{Target region containing the parallel region that can be specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22_}} +#pragma omp target teams // #2 \ + // expected-remark@#2 {{Target region containing the parallel region that can be specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22_}} \ Lint: Pre-merge checks: clang-format: please reformat the code ``` -#pragma omp target teams // #2…
				// expected-remark@#2 {{Target region containing the parallel region that can be specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22_}}
				// expected-remark@#2 {{Target region containing the parallel region that can be specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22_}}
				{
				#pragma omp parallel // #3
				Lint: Pre-merge checks Inline Actions clang-format: please reformat the code -#pragma omp parallel // #3 - // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}} +#pragma omp parallel // #3 \ + // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}} \ Lint: Pre-merge checks: clang-format: please reformat the code ``` -#pragma omp parallel // #3 - //…
				// expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}}
				// expected-remark@#3 {{Specialize parallel region reached only from a single target region. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
				{
				}
				bar();
				#pragma omp parallel // #4
				Lint: Pre-merge checks Inline Actions clang-format: please reformat the code -#pragma omp parallel // #4 - // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}} +#pragma omp parallel // #4 \ + // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}} \ Lint: Pre-merge checks: clang-format: please reformat the code ``` -#pragma omp parallel // #4 - //…
				// expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage in unrelated kernels in the same translation unit due to spurious call edges assumed by ptxas.}}
				// expected-remark@#4 {{Specialize parallel region reached only from a single target region. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
				{
				}
				}
				}

				// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
				// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Show First 20 Lines • Show All 1,027 Lines • ▼ Show 20 Lines	bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
if (!KernelPrepareParallelRFI)		if (!KernelPrepareParallelRFI)
return Changed;		return Changed;

for (Function *F : SCC) {		for (Function *F : SCC) {

// Check if the function is used in a __kmpc_kernel_prepare_parallel call at		// Check if the function is used in a __kmpc_kernel_prepare_parallel call at
// all.		// all.
bool UnknownUse = false;		bool UnknownUse = false;
		bool KernelPrepareUse = false;
unsigned NumDirectCalls = 0;		unsigned NumDirectCalls = 0;

SmallVector<Use *, 2> ToBeReplacedStateMachineUses;		SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
foreachUse(*F, [&](Use &U) {		foreachUse(*F, [&](Use &U) {
if (auto *CB = dyn_cast<CallBase>(U.getUser()))		if (auto *CB = dyn_cast<CallBase>(U.getUser()))
if (CB->isCallee(&U)) {		if (CB->isCallee(&U)) {
++NumDirectCalls;		++NumDirectCalls;
return;		return;
}		}

if (isa<ICmpInst>(U.getUser())) {		if (isa<ICmpInst>(U.getUser())) {
ToBeReplacedStateMachineUses.push_back(&U);		ToBeReplacedStateMachineUses.push_back(&U);
return;		return;
}		}
if (OpenMPOpt::getCallIfRegularCall(*U.getUser(),		if (OpenMPOpt::getCallIfRegularCall(*U.getUser(),
&KernelPrepareParallelRFI)) {		&KernelPrepareParallelRFI)) {
		KernelPrepareUse = true;
ToBeReplacedStateMachineUses.push_back(&U);		ToBeReplacedStateMachineUses.push_back(&U);
return;		return;
}		}
UnknownUse = true;		UnknownUse = true;
});		});

// If this ever hits, we should investigate.		// Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
if (UnknownUse \|\| NumDirectCalls != 1)		// use.
		if (!KernelPrepareUse)
continue;		continue;

// TODO: This is not a necessary restriction and should be lifted.		{
if (ToBeReplacedStateMachineUses.size() != 2)		auto Remark = [&](OptimizationRemark OR) {
		return OR << "Found a parallel region that is called in a target "
		"region but not part of a combined target construct nor "
		"nesed inside a target construct without intermediate "
		"code. This can lead to excessive register usage in "
		"unrelated kernels in the same translation unit due to "
		"spurious call edges assumed by ptxas.";
		};
		emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
		}

		// If this ever hits, we should investigate.
		// TODO: Checking the number of uses is not a necessary restriction and
		// should be lifted.
		if (UnknownUse \|\| NumDirectCalls != 1 \|\|
		ToBeReplacedStateMachineUses.size() != 2) {
		{
		auto Remark = [&](OptimizationRemark OR) {
		return OR << "Parallel region is used in "
		<< (UnknownUse ? "unknown" : "unexpected")
		<< " ways; will not attempt to rewrite the state machine.";
		};
		emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
		}
continue;		continue;
		}

// Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give		// Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
// up if the function is not called from a unique kernel.		// up if the function is not called from a unique kernel.
Kernel K = getUniqueKernelFor(*F);		Kernel K = getUniqueKernelFor(*F);
if (!K)		if (!K) {
		{
		auto Remark = [&](OptimizationRemark OR) {
		return OR << "Parallel region is not known to be called from a "
		"unique single target region, maybe the surrounding "
		"function has external linkage?; will not attempt to "
		"rewrite the state machine use.";
		};
		emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
		Remark);
		}
continue;		continue;
		}

// We now know F is a parallel body function called only from the kernel K.		// We now know F is a parallel body function called only from the kernel K.
// We also identified the state machine uses in which we replace the		// We also identified the state machine uses in which we replace the
// function pointer by a new global symbol for identification purposes. This		// function pointer by a new global symbol for identification purposes. This
// ensures only direct calls to the function are left.		// ensures only direct calls to the function are left.

		{
		auto RemarkParalleRegion = [&](OptimizationRemark OR) {
		return OR << "Specialize parallel region reached only from a single "
		"target region. (parallel region ID: "
		<< ore::NV("OpenMPParallelRegion", F->getName())
		<< ", kernel ID: "
		<< ore::NV("OpenMPTargetRegion", K->getName()) << ")";
		};
		emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
		RemarkParalleRegion);
		auto RemarkKernel = [&](OptimizationRemark OR) {
		return OR << "Target region containing the parallel region that can be "
		"specialized. (parallel region ID: "
		<< ore::NV("OpenMPParallelRegion", F->getName())
		<< ", kernel ID: "
		<< ore::NV("OpenMPTargetRegion", K->getName()) << ")";
		};
		emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
		}

Module &M = *F->getParent();		Module &M = *F->getParent();
Type *Int8Ty = Type::getInt8Ty(M.getContext());		Type *Int8Ty = Type::getInt8Ty(M.getContext());

auto *ID = new GlobalVariable(		auto *ID = new GlobalVariable(
M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,		M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
UndefValue::get(Int8Ty), F->getName() + ".ID");		UndefValue::get(Int8Ty), F->getName() + ".ID");

for (Use *U : ToBeReplacedStateMachineUses)		for (Use *U : ToBeReplacedStateMachineUses)
▲ Show 20 Lines • Show All 347 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP][NFC] Emit remarks during GPU state machine optimization
Closed, Public

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 277889

clang/test/OpenMP/remarks_parallel_in_target_state_machine.c

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP][NFC] Emit remarks during GPU state machine optimization (Closed, Public)

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 277889

clang/test/OpenMP/remarks_parallel_in_target_state_machine.c

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

[OpenMP][NFC] Emit remarks during GPU state machine optimization
Closed, Public