This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Refine rcp/rsq intrinsic folding for modern FP rules
ClosedPublic

Authored by arsenm on May 22 2020, 5:14 AM.

Download Raw Diff

Details

Reviewers

foad
rampitec
b-sumner
nhaehnle

Summary

We have to assume undef could be an sNaN, which would need quieting, so
returning a qNaN is safer than returning undef. Also skip the constant fold
for strictfp calls, and fold constant rcp even when the division result is
rounded (inexact).

Diff Detail

Event Timeline

arsenm created this revision. May 22 2020, 5:14 AM

Herald added a project: Restricted Project. · View Herald Transcript · May 22 2020, 5:14 AM

Herald added subscribers: kerbowa, hiraditya, t-tye and 6 others. · View Herald Transcript

We have to assume undef could be an snan, which would need quieting so returning qnan is safer than undef.

So you've chosen to optimize assuming that an undef input was a nan. Is that better than assuming it was something more ordinary like 0?

In D80432#2050967, @foad wrote:

We have to assume undef could be an snan, which would need quieting so returning qnan is safer than undef.

So you've chosen to optimize assuming that an undef input was a nan. Is that better than assuming it was something more ordinary like 0?

Returning a qNaN matches the current fdiv handling. A NaN result also lets more of the operations that use it fold away, and more consistently.

rampitec accepted this revision. May 22 2020, 10:21 AM

This revision is now accepted and ready to land. May 22 2020, 10:21 AM

27fe841aa650a24fd98da2fb6c6eb2fca806a63f

Revision Contents

Path

Size

llvm/

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

31 lines

test/

Transforms/

InstCombine/

AMDGPU/

amdgcn-intrinsics.ll

27 lines

Diff 265708

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 3,494 Lines • ▼ Show 20 Lines	if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29),
CarryKnown))		CarryKnown))
return II;		return II;
break;		break;
}		}
case Intrinsic::amdgcn_rcp: {		case Intrinsic::amdgcn_rcp: {
Value *Src = II->getArgOperand(0);		Value *Src = II->getArgOperand(0);

// TODO: Move to ConstantFolding/InstSimplify?		// TODO: Move to ConstantFolding/InstSimplify?
if (isa<UndefValue>(Src))		if (isa<UndefValue>(Src)) {
return replaceInstUsesWith(CI, Src);		Type *Ty = II->getType();
		auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
		return replaceInstUsesWith(CI, QNaN);
		}

		if (II->isStrictFP())
		break;

if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {		if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
const APFloat &ArgVal = C->getValueAPF();		const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1);		APFloat Val(ArgVal.getSemantics(), 1);
APFloat::opStatus Status = Val.divide(ArgVal,		Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
APFloat::rmNearestTiesToEven);
// Only do this if it was exact and therefore not dependent on the		// This is more precise than the instruction may give.
// rounding mode.		//
if (Status == APFloat::opOK)		// TODO: The instruction always flushes denormal results (except for f16),
		// should this also?
return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));		return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
}		}

break;		break;
}		}
case Intrinsic::amdgcn_rsq: {		case Intrinsic::amdgcn_rsq: {
Value *Src = II->getArgOperand(0);		Value *Src = II->getArgOperand(0);

// TODO: Move to ConstantFolding/InstSimplify?		// TODO: Move to ConstantFolding/InstSimplify?
if (isa<UndefValue>(Src))		if (isa<UndefValue>(Src)) {
return replaceInstUsesWith(CI, Src);		Type *Ty = II->getType();
		auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
		return replaceInstUsesWith(CI, QNaN);
		}

break;		break;
}		}
case Intrinsic::amdgcn_frexp_mant:		case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp: {		case Intrinsic::amdgcn_frexp_exp: {
Value *Src = II->getArgOperand(0);		Value *Src = II->getArgOperand(0);
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {		if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
int Exp;		int Exp;
APFloat Significand = frexp(C->getValueAPF(), Exp,		APFloat Significand = frexp(C->getValueAPF(), Exp,
▲ Show 20 Lines • Show All 1,608 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -instcombine -S < %s | FileCheck %s			; RUN: opt -instcombine -S < %s | FileCheck %s

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.rcp			; llvm.amdgcn.rcp
	; --------------------------------------------------------------------			; --------------------------------------------------------------------

	declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone			declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone
	declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone			declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone

	define float @test_constant_fold_rcp_f32_undef() nounwind {			define float @test_constant_fold_rcp_f32_undef() nounwind {
	; CHECK-LABEL: @test_constant_fold_rcp_f32_undef(			; CHECK-LABEL: @test_constant_fold_rcp_f32_undef(
	; CHECK-NEXT: ret float undef			; CHECK-NEXT: ret float 0x7FF8000000000000
	;			;
	%val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone			%val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone
	ret float %val			ret float %val
	}			}

	define float @test_constant_fold_rcp_f32_1() nounwind {			define float @test_constant_fold_rcp_f32_1() nounwind {
	; CHECK-LABEL: @test_constant_fold_rcp_f32_1(			; CHECK-LABEL: @test_constant_fold_rcp_f32_1(
	; CHECK-NEXT: ret float 1.000000e+00			; CHECK-NEXT: ret float 1.000000e+00
	Show All 23 Lines
	; CHECK-NEXT: ret double 2.000000e+00			; CHECK-NEXT: ret double 2.000000e+00
	;			;
	%val = call double @llvm.amdgcn.rcp.f64(double 0.5) nounwind readnone			%val = call double @llvm.amdgcn.rcp.f64(double 0.5) nounwind readnone
	ret double %val			ret double %val
	}			}

	define float @test_constant_fold_rcp_f32_43() nounwind {			define float @test_constant_fold_rcp_f32_43() nounwind {
	; CHECK-LABEL: @test_constant_fold_rcp_f32_43(			; CHECK-LABEL: @test_constant_fold_rcp_f32_43(
	; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01)			; CHECK-NEXT: ret float 0x3F97D05F40000000
	; CHECK-NEXT: ret float [[VAL]]
	;			;
	%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone			%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone
	ret float %val			ret float %val
	}			}

	define double @test_constant_fold_rcp_f64_43() nounwind {			define double @test_constant_fold_rcp_f64_43() nounwind {
	; CHECK-LABEL: @test_constant_fold_rcp_f64_43(			; CHECK-LABEL: @test_constant_fold_rcp_f64_43(
	; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01)			; CHECK-NEXT: ret double 0x3F97D05F417D05F4
	; CHECK-NEXT: ret double [[VAL]]
	;			;
	%val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone			%val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone
	ret double %val			ret double %val
	}			}

				define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
				; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
				; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #7
				; CHECK-NEXT: ret float [[VAL]]
				;
				%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
				ret float %val
				}

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.rsq			; llvm.amdgcn.rsq
	; --------------------------------------------------------------------			; --------------------------------------------------------------------

	declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone			declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone

	define float @test_constant_fold_rsq_f32_undef() nounwind {			define float @test_constant_fold_rsq_f32_undef() nounwind {
	; CHECK-LABEL: @test_constant_fold_rsq_f32_undef(			; CHECK-LABEL: @test_constant_fold_rsq_f32_undef(
	; CHECK-NEXT: ret float undef			; CHECK-NEXT: ret float 0x7FF8000000000000
	;			;
	%val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone			%val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone
	ret float %val			ret float %val
	}			}

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.frexp.mant			; llvm.amdgcn.frexp.mant
	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	▲ Show 20 Lines • Show All 2,296 Lines • ▼ Show 20 Lines
	; llvm.amdgcn.ballot			; llvm.amdgcn.ballot
	; --------------------------------------------------------------------			; --------------------------------------------------------------------

	declare i64 @llvm.amdgcn.ballot.i64(i1) nounwind readnone convergent			declare i64 @llvm.amdgcn.ballot.i64(i1) nounwind readnone convergent
	declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent			declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent

	define i64 @ballot_nocombine_64(i1 %i) {			define i64 @ballot_nocombine_64(i1 %i) {
	; CHECK-LABEL: @ballot_nocombine_64(			; CHECK-LABEL: @ballot_nocombine_64(
	; CHECK-NEXT: %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)			; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]])
	; CHECK-NEXT: ret i64 %b			; CHECK-NEXT: ret i64 [[B]]
	;			;
	%b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)			%b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)
	ret i64 %b			ret i64 %b
	}			}

	define i64 @ballot_zero_64() {			define i64 @ballot_zero_64() {
	; CHECK-LABEL: @ballot_zero_64(			; CHECK-LABEL: @ballot_zero_64(
	; CHECK-NEXT: ret i64 0			; CHECK-NEXT: ret i64 0
	;			;
	%b = call i64 @llvm.amdgcn.ballot.i64(i1 0)			%b = call i64 @llvm.amdgcn.ballot.i64(i1 0)
	ret i64 %b			ret i64 %b
	}			}

	define i64 @ballot_one_64() {			define i64 @ballot_one_64() {
	; CHECK-LABEL: @ballot_one_64(			; CHECK-LABEL: @ballot_one_64(
	; CHECK-NEXT: %b = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT]]			; CHECK-NEXT: %b = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT]]
	; CHECK-NEXT: ret i64 %b			; CHECK-NEXT: ret i64 %b
	;			;
	%b = call i64 @llvm.amdgcn.ballot.i64(i1 1)			%b = call i64 @llvm.amdgcn.ballot.i64(i1 1)
	ret i64 %b			ret i64 %b
	}			}

	define i32 @ballot_nocombine_32(i1 %i) {			define i32 @ballot_nocombine_32(i1 %i) {
	; CHECK-LABEL: @ballot_nocombine_32(			; CHECK-LABEL: @ballot_nocombine_32(
	; CHECK-NEXT: %b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)			; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]])
	; CHECK-NEXT: ret i32 %b			; CHECK-NEXT: ret i32 [[B]]
	;			;
	%b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)			%b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)
	ret i32 %b			ret i32 %b
	}			}

	define i32 @ballot_zero_32() {			define i32 @ballot_zero_32() {
	; CHECK-LABEL: @ballot_zero_32(			; CHECK-LABEL: @ballot_zero_32(
	; CHECK-NEXT: ret i32 0			; CHECK-NEXT: ret i32 0
	▲ Show 20 Lines • Show All 371 Lines • Show Last 20 Lines