This is an archive of the discontinued LLVM Phabricator instance.

clang: Attach !fpmath metadata to __builtin_sqrt based on language flags
ClosedPublic

Authored by arsenm on Jul 5 2023, 4:42 AM.

Download Raw Diff

Details

Reviewers

yaxunl
Anastasia
jcranmer-intel
tra
jlebar
jhuber6
jdoerfert

Summary

OpenCL and HIP have -cl-fp32-correctly-rounded-divide-sqrt and
-fno-hip-correctly-rounded-divide-sqrt. The corresponding fpmath metadata
was only set on fdiv, and not on sqrt. The backend is currently underutilizing
sqrt lowering options, and the responsibility is split between the libraries
and the backend, so this metadata is needed.

CUDA/NVCC has -prec-div and -prec-sqrt, but clang doesn't appear to be
aiming for compatibility with those. I don't know if OpenMP has a similar
control.

Diff Detail

Event Timeline

arsenm created this revision.Jul 5 2023, 4:42 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 5 2023, 4:42 AM

arsenm requested review of this revision.Jul 5 2023, 4:42 AM

Herald added a reviewer: jdoerfert. · View Herald TranscriptJul 5 2023, 4:42 AM

Herald added subscribers: jplehr, sstefan1, wdng. · View Herald Transcript

Harbormaster completed remote builds in B243178: Diff 537295.Jul 5 2023, 5:14 AM

yaxunl added inline comments.Jul 5 2023, 12:59 PM

clang/lib/CodeGen/CGExpr.cpp
5594	the spec says sqrt relative error is 3ULP https://registry.khronos.org/OpenCL/specs/2.2/html/OpenCL_C.html#relative-error-as-ulps

arsenm added inline comments.Jul 5 2023, 1:49 PM

clang/lib/CodeGen/CGExpr.cpp
5594	Did that change between versions? In any case I don’t want to change the currently used threshold in this patch. We only need 1.0 anyway

arsenm added inline comments.Jul 5 2023, 2:30 PM

clang/lib/CodeGen/CGExpr.cpp
5594	Oh, I see the threshold is 2.5 for fdiv and 3.0 for sqrt.

Split div/sqrt handling since they have different values. Also cuda does have unimplemented flags to control these individually. Not sure it's worth trying to merge them into one function

Harbormaster completed remote builds in B243477: Diff 537737.Jul 6 2023, 8:56 AM

FWIW, I assume we want this also for OpenMP offload.

In D154495#4479481, @jdoerfert wrote:

FWIW, I assume we want this also for OpenMP offload.

I'd be surprised if OpenMP let you do this by default

ping

LGTM. Thanks.

This revision is now accepted and ready to land.Jul 14 2023, 10:54 AM

bac2a075408377a8aa41f6626b17bb3e471221f3

Revision Contents

Path

Size

clang/

lib/

CodeGen/

11 lines

42 lines

16 lines

8 lines

test/

CodeGenCUDA/

correctly-rounded-div.cu

16 lines

CodeGenOpenCL/

fpmath.cl

29 lines

Diff 537737

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,526 Lines • ▼ Show 20 Lines	if (FD->hasAttr<ConstAttr>() \|\|

case Builtin::BIsqrt:		case Builtin::BIsqrt:
case Builtin::BIsqrtf:		case Builtin::BIsqrtf:
case Builtin::BIsqrtl:		case Builtin::BIsqrtl:
case Builtin::BI__builtin_sqrt:		case Builtin::BI__builtin_sqrt:
case Builtin::BI__builtin_sqrtf:		case Builtin::BI__builtin_sqrtf:
case Builtin::BI__builtin_sqrtf16:		case Builtin::BI__builtin_sqrtf16:
case Builtin::BI__builtin_sqrtl:		case Builtin::BI__builtin_sqrtl:
case Builtin::BI__builtin_sqrtf128:		case Builtin::BI__builtin_sqrtf128: {
return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,		llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
Intrinsic::sqrt,		*this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
Intrinsic::experimental_constrained_sqrt));		SetSqrtFPAccuracy(Call);
		return RValue::get(Call);
		}
case Builtin::BItrunc:		case Builtin::BItrunc:
case Builtin::BItruncf:		case Builtin::BItruncf:
case Builtin::BItruncl:		case Builtin::BItruncl:
case Builtin::BI__builtin_trunc:		case Builtin::BI__builtin_trunc:
case Builtin::BI__builtin_truncf:		case Builtin::BI__builtin_truncf:
case Builtin::BI__builtin_truncf16:		case Builtin::BI__builtin_truncf16:
case Builtin::BI__builtin_truncl:		case Builtin::BI__builtin_truncl:
case Builtin::BI__builtin_truncf128:		case Builtin::BI__builtin_truncf128:
▲ Show 20 Lines • Show All 17,977 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGExpr.cpp

Show First 20 Lines • Show All 5,571 Lines • ▼ Show 20 Lines	if (Accuracy == 0.0 \|\| !isa<llvm::Instruction>(Val))
return;		return;

llvm::MDBuilder MDHelper(getLLVMContext());		llvm::MDBuilder MDHelper(getLLVMContext());
llvm::MDNode *Node = MDHelper.createFPMath(Accuracy);		llvm::MDNode *Node = MDHelper.createFPMath(Accuracy);

cast<llvm::Instruction>(Val)->setMetadata(llvm::LLVMContext::MD_fpmath, Node);		cast<llvm::Instruction>(Val)->setMetadata(llvm::LLVMContext::MD_fpmath, Node);
}		}

		void CodeGenFunction::SetSqrtFPAccuracy(llvm::Value *Val) {
		llvm::Type *EltTy = Val->getType()->getScalarType();
		if (!EltTy->isFloatTy())
		return;

		if ((getLangOpts().OpenCL &&
		!CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) \|\|
		(getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
		!CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
		// OpenCL v1.1 s7.4: minimum accuracy of single precision sqrt is 3ulp
		//
		// OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
		// build option allows an application to specify that single precision
		// floating-point divide (x/y and 1/x) and sqrt used in the program
		// source are correctly rounded.
		yaxunlUnsubmitted Not Done Reply Inline Actions the spec says sqrt relative error is 3ULP https://registry.khronos.org/OpenCL/specs/2.2/html/OpenCL_C.html#relative-error-as-ulps yaxunl: the spec says sqrt relative error is 3ULP https://registry.khronos.org/OpenCL/specs/2.
		arsenmAuthorUnsubmitted Done Reply Inline Actions Did that change between versions? In any case I don’t want to change the currently used threshold in this patch. We only need 1.0 anyway arsenm: Did that change between versions? In any case I don’t want to change the currently used…
		arsenmAuthorUnsubmitted Done Reply Inline Actions Oh, I see the threshold is 2.5 for fdiv and 3.0 for sqrt. arsenm: Oh, I see the threshold is 2.5 for fdiv and 3.0 for sqrt.
		//
		// TODO: CUDA has a prec-sqrt flag
		SetFPAccuracy(Val, 3.0f);
		}
		}

		void CodeGenFunction::SetDivFPAccuracy(llvm::Value *Val) {
		llvm::Type *EltTy = Val->getType()->getScalarType();
		if (!EltTy->isFloatTy())
		return;

		if ((getLangOpts().OpenCL &&
		!CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) \|\|
		(getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
		!CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
		// OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
		//
		// OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
		// build option allows an application to specify that single precision
		// floating-point divide (x/y and 1/x) and sqrt used in the program
		// source are correctly rounded.
		//
		// TODO: CUDA has a prec-div flag
		SetFPAccuracy(Val, 2.5f);
		}
		}

namespace {		namespace {
struct LValueOrRValue {		struct LValueOrRValue {
LValue LV;		LValue LV;
RValue RV;		RValue RV;
};		};
}		}

static LValueOrRValue emitPseudoObjectExpr(CodeGenFunction &CGF,		static LValueOrRValue emitPseudoObjectExpr(CodeGenFunction &CGF,
▲ Show 20 Lines • Show All 79 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGExprScalar.cpp

Show First 20 Lines • Show All 3,472 Lines • ▼ Show 20 Lines	if (Ops.Ty->isConstantMatrixType()) {
return MB.CreateScalarDiv(Ops.LHS, Ops.RHS,		return MB.CreateScalarDiv(Ops.LHS, Ops.RHS,
Ops.Ty->hasUnsignedIntegerRepresentation());		Ops.Ty->hasUnsignedIntegerRepresentation());
}		}

if (Ops.LHS->getType()->isFPOrFPVectorTy()) {		if (Ops.LHS->getType()->isFPOrFPVectorTy()) {
llvm::Value *Val;		llvm::Value *Val;
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);		CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");		Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");
if ((CGF.getLangOpts().OpenCL &&		CGF.SetDivFPAccuracy(Val);
!CGF.CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) \|\|
(CGF.getLangOpts().HIP && CGF.getLangOpts().CUDAIsDevice &&
!CGF.CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
// OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
// OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
// build option allows an application to specify that single precision
// floating-point divide (x/y and 1/x) and sqrt used in the program
// source are correctly rounded.
llvm::Type *ValTy = Val->getType();
if (ValTy->isFloatTy() \|\|
(isa<llvm::VectorType>(ValTy) &&
cast<llvm::VectorType>(ValTy)->getElementType()->isFloatTy()))
CGF.SetFPAccuracy(Val, 2.5);
}
return Val;		return Val;
}		}
else if (Ops.isFixedPointOp())		else if (Ops.isFixedPointOp())
return EmitFixedPointBinOp(Ops);		return EmitFixedPointBinOp(Ops);
else if (Ops.Ty->hasUnsignedIntegerRepresentation())		else if (Ops.Ty->hasUnsignedIntegerRepresentation())
return Builder.CreateUDiv(Ops.LHS, Ops.RHS, "div");		return Builder.CreateUDiv(Ops.LHS, Ops.RHS, "div");
else		else
return Builder.CreateSDiv(Ops.LHS, Ops.RHS, "div");		return Builder.CreateSDiv(Ops.LHS, Ops.RHS, "div");
▲ Show 20 Lines • Show All 1,930 Lines • Show Last 20 Lines

clang/lib/CodeGen/CodeGenFunction.h

Show First 20 Lines • Show All 4,698 Lines • ▼ Show 20 Lines	public:
/// a r-value suitable for passing the given parameter.		/// a r-value suitable for passing the given parameter.
void EmitDelegateCallArg(CallArgList &args, const VarDecl *param,		void EmitDelegateCallArg(CallArgList &args, const VarDecl *param,
SourceLocation loc);		SourceLocation loc);

/// SetFPAccuracy - Set the minimum required accuracy of the given floating		/// SetFPAccuracy - Set the minimum required accuracy of the given floating
/// point operation, expressed as the maximum relative error in ulp.		/// point operation, expressed as the maximum relative error in ulp.
void SetFPAccuracy(llvm::Value *Val, float Accuracy);		void SetFPAccuracy(llvm::Value *Val, float Accuracy);

		/// Set the minimum required accuracy of the given sqrt operation
		/// based on CodeGenOpts.
		void SetSqrtFPAccuracy(llvm::Value *Val);

		/// Set the minimum required accuracy of the given division operation based
		/// on CodeGenOpts.
		void SetDivFPAccuracy(llvm::Value *Val);

/// Set the codegen fast-math flags.		/// Set the codegen fast-math flags.
void SetFastMathFlags(FPOptions FPFeatures);		void SetFastMathFlags(FPOptions FPFeatures);

// Truncate or extend a boolean vector to the requested number of elements.		// Truncate or extend a boolean vector to the requested number of elements.
llvm::Value emitBoolVecConversion(llvm::Value SrcVec,		llvm::Value emitBoolVecConversion(llvm::Value SrcVec,
unsigned NumElementsDst,		unsigned NumElementsDst,
const llvm::Twine &Name = "");		const llvm::Twine &Name = "");

▲ Show 20 Lines • Show All 199 Lines • Show Last 20 Lines

clang/test/CodeGenCUDA/correctly-rounded-div.cu

	Show All 26 Lines
	}			}

	// COMMON-LABEL: @_Z11dpscalardiv			// COMMON-LABEL: @_Z11dpscalardiv
	// COMMON-NOT: !fpmath			// COMMON-NOT: !fpmath
	__device__ double dpscalardiv(double a, double b) {			__device__ double dpscalardiv(double a, double b) {
	return a / b;			return a / b;
	}			}

	// NCRDIV: ![[MD]] = !{float 2.500000e+00}			// COMMON-LABEL: @_Z12spscalarsqrt
				// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
				// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
				__device__ float spscalarsqrt(float a) {
				return __builtin_sqrtf(a);
				}

				// COMMON-LABEL: @_Z12dpscalarsqrt
				// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
				// COMMON-NOT: !fpmath
				__device__ double dpscalarsqrt(double a) {
				return __builtin_sqrt(a);
				}

				// NCRSQRT: ![[MD]] = !{float 2.500000e+00}

clang/test/CodeGenOpenCL/fpmath.cl

	// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown \| FileCheck --check-prefix=CHECK --check-prefix=NODIVOPT %s			// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown \| FileCheck --check-prefix=CHECK --check-prefix=NODIVOPT %s
	// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -cl-fp32-correctly-rounded-divide-sqrt \| FileCheck --check-prefix=CHECK --check-prefix=DIVOPT %s			// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -cl-fp32-correctly-rounded-divide-sqrt \| FileCheck --check-prefix=CHECK --check-prefix=DIVOPT %s
	// RUN: %clang_cc1 %s -emit-llvm -o - -DNOFP64 -cl-std=CL1.2 -triple r600-unknown-unknown -target-cpu r600 -pedantic \| FileCheck --check-prefix=CHECK-FLT %s			// RUN: %clang_cc1 %s -emit-llvm -o - -DNOFP64 -cl-std=CL1.2 -triple r600-unknown-unknown -target-cpu r600 -pedantic \| FileCheck --check-prefix=CHECK-FLT %s
	// RUN: %clang_cc1 %s -emit-llvm -o - -DFP64 -cl-std=CL1.2 -triple spir-unknown-unknown -pedantic \| FileCheck --check-prefix=CHECK-DBL %s			// RUN: %clang_cc1 %s -emit-llvm -o - -DFP64 -cl-std=CL1.2 -triple spir-unknown-unknown -pedantic \| FileCheck --check-prefix=CHECK-DBL %s

	typedef __attribute__(( ext_vector_type(4) )) float float4;			typedef __attribute__(( ext_vector_type(4) )) float float4;

	float spscalardiv(float a, float b) {			float spscalardiv(float a, float b) {
	// CHECK: @spscalardiv			// CHECK: @spscalardiv
	// CHECK: fdiv{{.*}},			// CHECK: fdiv{{.*}},
	// NODIVOPT: !fpmath ![[MD:[0-9]+]]			// NODIVOPT: !fpmath ![[MD_FDIV:[0-9]+]]
	// DIVOPT-NOT: !fpmath !{{[0-9]+}}			// DIVOPT-NOT: !fpmath !{{[0-9]+}}
	return a / b;			return a / b;
	}			}

	float4 spvectordiv(float4 a, float4 b) {			float4 spvectordiv(float4 a, float4 b) {
	// CHECK: @spvectordiv			// CHECK: @spvectordiv
	// CHECK: fdiv{{.*}},			// CHECK: fdiv{{.*}},
	// NODIVOPT: !fpmath ![[MD]]			// NODIVOPT: !fpmath ![[MD_FDIV]]
	// DIVOPT-NOT: !fpmath !{{[0-9]+}}			// DIVOPT-NOT: !fpmath !{{[0-9]+}}
	return a / b;			return a / b;
	}			}

				float spscalarsqrt(float a) {
				// CHECK-LABEL: @spscalarsqrt
				// NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
				// DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
				return __builtin_sqrtf(a);
				}

	#if __OPENCL_C_VERSION__ >=120			#if __OPENCL_C_VERSION__ >=120
	void printf(constant char* fmt, ...);			void printf(constant char* fmt, ...);

	void testdbllit(long *val) {			void testdbllit(long *val) {
	// CHECK-FLT: float noundef 2.000000e+01			// CHECK-FLT: float noundef 2.000000e+01
	// CHECK-DBL: double noundef 2.000000e+01			// CHECK-DBL: double noundef 2.000000e+01
	printf("%f", 20.0);			printf("%f", 20.0);
	}			}

	#endif			#endif

	#ifndef NOFP64			#ifndef NOFP64
	#pragma OPENCL EXTENSION cl_khr_fp64 : enable			#pragma OPENCL EXTENSION cl_khr_fp64 : enable
				typedef __attribute__(( ext_vector_type(4) )) double double4;

	double dpscalardiv(double a, double b) {			double dpscalardiv(double a, double b) {
	// CHECK: @dpscalardiv			// CHECK: @dpscalardiv
	// CHECK-NOT: !fpmath			// CHECK-NOT: !fpmath
	return a / b;			return a / b;
	}			}

				double4 dpvectordiv(double4 a, double4 b) {
				// CHECK: @dpvectordiv
				// CHECK-NOT: !fpmath
				return a / b;
				}

				double dpscalarsqrt(double a) {
				// CHECK-LABEL: @dpscalarsqrt
				// CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
				return __builtin_sqrt(a);
				}

	#endif			#endif

	// NODIVOPT: ![[MD]] = !{float 2.500000e+00}			// NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
				// NODIVOPT: ![[MD_SQRT]] = !{float 3.000000e+00}