This is an archive of the discontinued LLVM Phabricator instance.

tra pointed out that device_functions.hpp uses `__USE_FAST_MATH__` for its own
purposes. For this CL, we only want to define `__USE_FAST_MATH__` around
math_functions.hpp.

LGTM.

Closed by commit rL270484: [CUDA] Add -fcuda-approx-transcendentals flag. (authored by jlebar). · Explain Why · May 23 2016, 1:26 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

cfe/

trunk/

include/

clang/

Basic/

LangOptions.def

1 line

Driver/

Options.td

3 lines

lib/

Driver/

ToolChains.cpp

4 lines

Frontend/

CompilerInvocation.cpp

3 lines

InitPreprocessor.cpp

6 lines

Headers/

__clang_cuda_runtime_wrapper.h

14 lines

test/

Preprocessor/

cuda-approx-transcendentals.cu

8 lines

Diff 58145

cfe/trunk/include/clang/Basic/LangOptions.def

	Show First 20 Lines • Show All 184 Lines • ▼ Show 20 Lines
	LANGOPT(OpenMP , 1, 0, "OpenMP support")			LANGOPT(OpenMP , 1, 0, "OpenMP support")
	LANGOPT(OpenMPUseTLS , 1, 0, "Use TLS for threadprivates or runtime calls")			LANGOPT(OpenMPUseTLS , 1, 0, "Use TLS for threadprivates or runtime calls")
	LANGOPT(OpenMPIsDevice , 1, 0, "Generate code only for OpenMP target device")			LANGOPT(OpenMPIsDevice , 1, 0, "Generate code only for OpenMP target device")

	LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")			LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
	LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")			LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
	LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")			LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
	LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")			LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
				LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")

	LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")			LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
	LANGOPT(ConceptsTS , 1, 0, "enable C++ Extensions for Concepts")			LANGOPT(ConceptsTS , 1, 0, "enable C++ Extensions for Concepts")
	BENIGN_LANGOPT(ElideConstructors , 1, 1, "C++ copy constructor elision")			BENIGN_LANGOPT(ElideConstructors , 1, 1, "C++ copy constructor elision")
	BENIGN_LANGOPT(DumpRecordLayouts , 1, 0, "dumping the layout of IRgen'd records")			BENIGN_LANGOPT(DumpRecordLayouts , 1, 0, "dumping the layout of IRgen'd records")
	BENIGN_LANGOPT(DumpRecordLayoutsSimple , 1, 0, "dumping the layout of IRgen'd records in a simple form")			BENIGN_LANGOPT(DumpRecordLayoutsSimple , 1, 0, "dumping the layout of IRgen'd records in a simple form")
	BENIGN_LANGOPT(DumpVTableLayouts , 1, 0, "dumping the layouts of emitted vtables")			BENIGN_LANGOPT(DumpVTableLayouts , 1, 0, "dumping the layouts of emitted vtables")
	LANGOPT(NoConstantCFStrings , 1, 0, "no constant CoreFoundation strings")			LANGOPT(NoConstantCFStrings , 1, 0, "no constant CoreFoundation strings")
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

cfe/trunk/include/clang/Driver/Options.td

	Show First 20 Lines • Show All 389 Lines • ▼ Show 20 Lines
	def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,			def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,
	HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;			HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
	def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;			def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
	def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,			def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
	HelpText<"CUDA installation path">;			HelpText<"CUDA installation path">;
	def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,			def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
	Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;			Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
	def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;			def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
				def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
				Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
				def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
	def dA : Flag<["-"], "dA">, Group<d_Group>;			def dA : Flag<["-"], "dA">, Group<d_Group>;
	def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,			def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print macro definitions in -E mode in addition to normal output">;			HelpText<"Print macro definitions in -E mode in addition to normal output">;
	def dM : Flag<["-"], "dM">, Group<d_Group>, Flags<[CC1Option]>,			def dM : Flag<["-"], "dM">, Group<d_Group>, Flags<[CC1Option]>,
	HelpText<"Print macro definitions in -E mode instead of normal output">;			HelpText<"Print macro definitions in -E mode instead of normal output">;
	def dead__strip : Flag<["-"], "dead_strip">;			def dead__strip : Flag<["-"], "dead_strip">;
	def dependency_file : Separate<["-"], "dependency-file">, Flags<[CC1Option]>,			def dependency_file : Separate<["-"], "dependency-file">, Flags<[CC1Option]>,
	HelpText<"Filename (or -) to write dependency output to">;			HelpText<"Filename (or -) to write dependency output to">;
	▲ Show 20 Lines • Show All 1,806 Lines • Show Last 20 Lines

cfe/trunk/lib/Driver/ToolChains.cpp

Show First 20 Lines • Show All 4,496 Lines • ▼ Show 20 Lines	CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const {		llvm::opt::ArgStringList &CC1Args) const {
Linux::addClangTargetOptions(DriverArgs, CC1Args);		Linux::addClangTargetOptions(DriverArgs, CC1Args);
CC1Args.push_back("-fcuda-is-device");		CC1Args.push_back("-fcuda-is-device");

if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,		if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
options::OPT_fno_cuda_flush_denormals_to_zero, false))		options::OPT_fno_cuda_flush_denormals_to_zero, false))
CC1Args.push_back("-fcuda-flush-denormals-to-zero");		CC1Args.push_back("-fcuda-flush-denormals-to-zero");

		if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
		options::OPT_fno_cuda_approx_transcendentals, false))
		CC1Args.push_back("-fcuda-approx-transcendentals");

if (DriverArgs.hasArg(options::OPT_nocudalib))		if (DriverArgs.hasArg(options::OPT_nocudalib))
return;		return;

std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(		std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(
DriverArgs.getLastArgValue(options::OPT_march_EQ));		DriverArgs.getLastArgValue(options::OPT_march_EQ));
if (!LibDeviceFile.empty()) {		if (!LibDeviceFile.empty()) {
CC1Args.push_back("-mlink-cuda-bitcode");		CC1Args.push_back("-mlink-cuda-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));		CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
▲ Show 20 Lines • Show All 359 Lines • Show Last 20 Lines

cfe/trunk/lib/Frontend/CompilerInvocation.cpp

Show First 20 Lines • Show All 1,610 Lines • ▼ Show 20 Lines	if (Args.hasArg(OPT_fcuda_allow_variadic_functions))
Opts.CUDAAllowVariadicFunctions = 1;		Opts.CUDAAllowVariadicFunctions = 1;

if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))		if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
Opts.CUDAHostDeviceConstexpr = 0;		Opts.CUDAHostDeviceConstexpr = 0;

if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))		if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
Opts.CUDADeviceFlushDenormalsToZero = 1;		Opts.CUDADeviceFlushDenormalsToZero = 1;

		if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
		Opts.CUDADeviceApproxTranscendentals = 1;

if (Opts.ObjC1) {		if (Opts.ObjC1) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {		if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
StringRef value = arg->getValue();		StringRef value = arg->getValue();
if (Opts.ObjCRuntime.tryParse(value))		if (Opts.ObjCRuntime.tryParse(value))
Diags.Report(diag::err_drv_unknown_objc_runtime) << value;		Diags.Report(diag::err_drv_unknown_objc_runtime) << value;
}		}

if (Args.hasArg(OPT_fobjc_gc_only))		if (Args.hasArg(OPT_fobjc_gc_only))
▲ Show 20 Lines • Show All 882 Lines • Show Last 20 Lines

cfe/trunk/lib/Frontend/InitPreprocessor.cpp

Show First 20 Lines • Show All 932 Lines • ▼ Show 20 Lines	#undef DEFINE_LOCK_FREE_MACRO

// CUDA device path compilaton		// CUDA device path compilaton
if (LangOpts.CUDAIsDevice) {		if (LangOpts.CUDAIsDevice) {
// The CUDA_ARCH value is set for the GPU target specified in the NVPTX		// The CUDA_ARCH value is set for the GPU target specified in the NVPTX
// backend's target defines.		// backend's target defines.
Builder.defineMacro("__CUDA_ARCH__");		Builder.defineMacro("__CUDA_ARCH__");
}		}

		// We need to communicate this to our CUDA header wrapper, which in turn
		// informs the proper CUDA headers of this choice.
		if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
		Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
		}

// OpenCL definitions.		// OpenCL definitions.
if (LangOpts.OpenCL) {		if (LangOpts.OpenCL) {
#define OPENCLEXT(Ext) \		#define OPENCLEXT(Ext) \
if (TI.getSupportedOpenCLOpts().is_##Ext##_supported( \		if (TI.getSupportedOpenCLOpts().is_##Ext##_supported( \
LangOpts.OpenCLVersion)) \		LangOpts.OpenCLVersion)) \
Builder.defineMacro(#Ext);		Builder.defineMacro(#Ext);
#include "clang/Basic/OpenCLExtensions.def"		#include "clang/Basic/OpenCLExtensions.def"
}		}
▲ Show 20 Lines • Show All 97 Lines • Show Last 20 Lines

cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h

	Show First 20 Lines • Show All 136 Lines • ▼ Show 20 Lines

	// device_functions.hpp and math_functions*.hpp use 'static			// device_functions.hpp and math_functions*.hpp use 'static
	// __forceinline__' (with no __device__) for definitions of device			// __forceinline__' (with no __device__) for definitions of device
	// functions. Temporarily redefine __forceinline__ to include			// functions. Temporarily redefine __forceinline__ to include
	// __device__.			// __device__.
	#pragma push_macro("__forceinline__")			#pragma push_macro("__forceinline__")
	#define __forceinline__ __device__ __inline__ __attribute__((always_inline))			#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
	#include "device_functions.hpp"			#include "device_functions.hpp"

				// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
				// get the slow-but-accurate or fast-but-inaccurate versions of functions like
				// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
				//
				// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
				// slow divides), so we need to scope our define carefully here.
				#pragma push_macro("__USE_FAST_MATH__")
				#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
				#define __USE_FAST_MATH__
				#endif
	#include "math_functions.hpp"			#include "math_functions.hpp"
				#pragma pop_macro("__USE_FAST_MATH__")

	#include "math_functions_dbl_ptx3.hpp"			#include "math_functions_dbl_ptx3.hpp"
	#pragma pop_macro("__forceinline__")			#pragma pop_macro("__forceinline__")

	// Pull in host-only functions that are only available when neither			// Pull in host-only functions that are only available when neither
	// __CUDACC__ nor __CUDABE__ are defined.			// __CUDACC__ nor __CUDABE__ are defined.
	#undef __MATH_FUNCTIONS_HPP__			#undef __MATH_FUNCTIONS_HPP__
	#undef __CUDABE__			#undef __CUDABE__
	#include "math_functions.hpp"			#include "math_functions.hpp"
	▲ Show 20 Lines • Show All 137 Lines • ▼ Show 20 Lines
	// used here for the redeclarations of blockDim and threadIdx.)			// used here for the redeclarations of blockDim and threadIdx.)
	#pragma push_macro("dim3")			#pragma push_macro("dim3")
	#pragma push_macro("uint3")			#pragma push_macro("uint3")
	#define dim3 __cuda_builtin_blockDim_t			#define dim3 __cuda_builtin_blockDim_t
	#define uint3 __cuda_builtin_threadIdx_t			#define uint3 __cuda_builtin_threadIdx_t
	#include "curand_mtgp32_kernel.h"			#include "curand_mtgp32_kernel.h"
	#pragma pop_macro("dim3")			#pragma pop_macro("dim3")
	#pragma pop_macro("uint3")			#pragma pop_macro("uint3")
				#pragma pop_macro("__USE_FAST_MATH__")

	#endif // __CUDA__			#endif // __CUDA__
	#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__			#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__

cfe/trunk/test/Preprocessor/cuda-approx-transcendentals.cu

				// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix HOST %s
				// RUN: %clang --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-NOFAST %s
				// RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
				// RUN: %clang -ffast-math --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s

				// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
				// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
				// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA] Add -fcuda-approx-transcendentals flag.ClosedPublic

Details

Diff Detail

Event Timeline