Diff 241807

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

	Show All 12 Lines
	BO.replaceAllUsesWith(NewSelect);			BO.replaceAllUsesWith(NewSelect);
	BO.eraseFromParent();			BO.eraseFromParent();
	if (CastOp)			if (CastOp)
	CastOp->eraseFromParent();			CastOp->eraseFromParent();
	Sel->eraseFromParent();			Sel->eraseFromParent();
	return true;			return true;
	}			}

	// Perform RCP optimizations:			// lowerUsingRcp:
	//			//
	// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with			// 1/x -> rcp(x) when fdiv is allowed to be re-associated or rcp is accurate.
				arsenmUnsubmitted Not Done Reply Inline Actions This has nothing to do with reassociation arsenm: This has nothing to do with reassociation
				cfangAuthorUnsubmitted Done Reply Inline Actions Division re-association: a/b -> a * rcp(b), and one special case is 1.0/b => 1.0*rcp(b) = rcp(b). This is how 1.0/x -> rcp(x) is associated with "re-association". cfang: Division re-association: a/b -> a * rcp(b), and one special case is 1.0/b => 1.0*rcp(b) = rcp…
				arsenmUnsubmitted Not Done Reply Inline Actions This isn't reassociation. This is just special handling of 1.0/b. Nothing algebraic changes here. There's no multiply introduced here arsenm: This isn't reassociation. This is just special handling of 1.0/b. Nothing algebraic changes here.
				cfangAuthorUnsubmitted Done Reply Inline Actions Ok, it seems we have a different understanding here. I think this is still just a naming issue. Originally the name is something like UnsafeMath? But I do think arcp also gives the permission to do 1.0/x -> rcp(x) even though no multiply is explicitly generated (1.0 * rcp(x) = rcp(x)). Maybe we should go back to use the original name as HasFastUnsafeOptions to clear your confusion? cfang: Ok, it seems we have a different understanding here. I think this is still just a naming issue.
				arsenmUnsubmitted Not Done Reply Inline Actions It's also the comment and flag being checked. There's no implicit or explicit multiply here, this is just a reciprocal. This pass is not responsible for doing the reassociating allowed by arcp. This should be a check for the approximate function math flag. allowReciprocal is not relevant here arsenm: It's also the comment and flag being checked. There's no implicit or explicit multiply here…
				cfangAuthorUnsubmitted Done Reply Inline Actions So we only check unsafe-fast-math or afn here, not arcp? Thanks. cfang: So we only check unsafe-fast-math or afn here, not arcp? Thanks.
				arsenmUnsubmitted Not Done Reply Inline Actions afn, and the closest match for the attribute is unsafe-fast-math, and not arcp. arsenm: afn, and the closest match for the attribute is unsafe-fast-math, and not arcp.
	// denormals flushed.
	//			//
	// a/b -> a*rcp(b) when fast unsafe rcp is legal.			// a/b -> a*rcp(b) when fdiv is allowed to be re-associated.
	static Value performRCPOpt(Value Num, Value *Den, bool FastUnsafeRcpLegal,			static Value lowerUsingRcp (Value Num, Value *Den, bool CanReassociateFDiv,
				arsenmUnsubmitted Not Done Reply Inline Actions This should not be referred to as lowering arsenm: This should not be referred to as lowering
				cfangAuthorUnsubmitted Done Reply Inline Actions I am thinking of a different name. Do you have a meaningful name for the function in mind? cfang: I am thinking of a different name. Do you have a meaningful name for the function in mind?
				arsenmUnsubmitted Not Done Reply Inline Actions combineRcp? arsenm: combineRcp?
				cfangAuthorUnsubmitted Done Reply Inline Actions Better to be somethingUseRcp and somethingUseFastFDiv. I am still not sure what something should be here. optimizeFDivUsingRcp? cfang: Better to be somethingUseRcp and somethingUseFastFDiv. I am still not sure what something…
	IRBuilder<> Builder, MDNode FPMath, Module Mod,			bool RcpIsAccurate, IRBuilder<> Builder,
	bool HasDenormals, bool NeedHighAccuracy) {			MDNode FPMath, Module Mod) {

	Type *Ty = Den->getType();			if (!CanReassociateFDiv && !RcpIsAccurate)
	if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
	(HasDenormals \|\| NeedHighAccuracy))
	return nullptr;			return nullptr;

				Type *Ty = Den->getType();
	Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);			Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
	if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {			if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
	if (FastUnsafeRcpLegal \|\| Ty->isFloatTy() \|\| Ty->isHalfTy()) {			if (CanReassociateFDiv \|\| RcpIsAccurate) {
				arsenmUnsubmitted Not Done Reply Inline Actions We aren't fdiv here. We're handling an fdiv, and not splitting it into a multiply and rcp arsenm: We aren't fdiv here. We're handling an fdiv, and not splitting it into a multiply and rcp
				cfangAuthorUnsubmitted Done Reply Inline Actions As explained in a previous comment, 1.0/x -> 1.0*rcp(x) = rcp(x) is a special case of re-association. As a result, if the options specify re-association, we can do 1.0/x -> rcp(x). cfang: As explained in a previous comment, 1.0/x -> 1.0*rcp(x) = rcp(x) is a special case of re…
	if (CLHS->isExactlyValue(1.0)) {			if (CLHS->isExactlyValue(1.0)) {
	// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to			// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
	// the CI documentation has a worst case error of 1 ulp.			// the CI documentation has a worst case error of 1 ulp.
	// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to			// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
	// use it as long as we aren't trying to use denormals.			// use it as long as we aren't trying to use denormals.
	//			//
	// v_rcp_f16 and v_rsq_f16 DO support denormals.			// v_rcp_f16 and v_rsq_f16 DO support denormals.

	// NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't			// NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
	// insert rsq intrinsic here.			// insert rsq intrinsic here.

	// 1.0 / x -> rcp(x)			// 1.0 / x -> rcp(x)
	return Builder.CreateCall(Decl, { Den });			return Builder.CreateCall(Decl, { Den });
	}			}

	// Same as for 1.0, but expand the sign out of the constant.			// Same as for 1.0, but expand the sign out of the constant.
	if (CLHS->isExactlyValue(-1.0)) {			if (CLHS->isExactlyValue(-1.0)) {
	// -1.0 / x -> rcp (fneg x)			// -1.0 / x -> rcp (fneg x)
	Value *FNeg = Builder.CreateFNeg(Den);			Value *FNeg = Builder.CreateFNeg(Den);
	return Builder.CreateCall(Decl, { FNeg });			return Builder.CreateCall(Decl, { FNeg });
	}			}
	}			}
	}			}

	if (FastUnsafeRcpLegal) {			if (CanReassociateFDiv) {
	// Turn into multiply by the reciprocal.			// Turn into multiply by the reciprocal.
	// x / y -> x * (1.0 / y)			// x / y -> x * (1.0 / y)
	Value *Recip = Builder.CreateCall(Decl, { Den });			Value *Recip = Builder.CreateCall(Decl, { Den });
	return Builder.CreateFMul(Num, Recip, "", FPMath);			return Builder.CreateFMul(Num, Recip, "", FPMath);
	}			}
	return nullptr;			return nullptr;
	}			}

	static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,			// lowerUsingFDivFast:
	bool HasDenormals) {			//
	const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);			// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
	if (!CNum)			//
	return HasDenormals;			// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
				//
				// NOTE: lowerUsingRcp should be tried first because rcp is the preference.
				static Value lowerUsingFDivFast(Value Num, Value *Den, float ReqdAccuracy,
				bool HasDenormals, IRBuilder<> Builder,
				MDNode FPMath, Module Mod) {
				// fdiv.fast can achieve 2.5 ULP accuracy.
				if (ReqdAccuracy < 2.5f)
				return nullptr;

	if (FastUnsafeRcpLegal)			// Only have fdiv.fast for f32.
	return true;			Type *Ty = Den->getType();
				if (!Ty->isFloatTy())
				return nullptr;

	bool IsOne = CNum->isExactlyValue(+1.0) \|\| CNum->isExactlyValue(-1.0);			bool NumIsOne = false;
				if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
				if (CNum->isExactlyValue(+1.0) \|\| CNum->isExactlyValue(-1.0))
				NumIsOne = true;
				}

	// Reciprocal f32 is handled separately without denormals.			// fdiv does not support denormals. But 1.0/x is always fine to use it.
	return HasDenormals ^ IsOne;			if (HasDenormals && !NumIsOne)
	}			return nullptr;

				Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
				return Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
				}

	// Optimizations are performed based on fpmath, fast math flags as well as			// Optimizations are performed based on fpmath, fast math flags as well as
	// denormals to lower fdiv using either rcp or fdiv.fast.			// denormals to lower fdiv using either rcp or fdiv.fast.
	//			//
	// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on			// Use rcp:
	// unsafe-fp-math, fast math flags, denormals and fpmath			// 1/x -> rcp(x) when fdiv is allowed to be re-associated or rcp is
	// accuracy request.			// sufficiently accurate.
	//			//
	// RCP Optimizations:			// a/b -> a*rcp(b) when fdiv is allowed to be re-associated.
	// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
	// denormals flushed.
	// a/b -> a*rcp(b) when fast unsafe rcp is legal.
	//			//
	// Use fdiv.fast:			// Use fdiv.fast:
	// a/b -> fdiv.fast(a, b) when RCP optimization is not performed and			// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
	// fpmath >= 2.5ULP with denormals flushed.			//
				// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
	//			//
	// 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and			// Using rcp is the preference.
	// fpmath >= 2.5ULP with denormals.
	bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {			bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

	Type *Ty = FDiv.getType()->getScalarType();			Type *Ty = FDiv.getType()->getScalarType();

	// No intrinsic for fdiv16 if target does not support f16.			// No intrinsic for fdiv16 if target does not support f16.
	if (Ty->isHalfTy() && !ST->has16BitInsts())			if (Ty->isHalfTy() && !ST->has16BitInsts())
	return false;			return false;

	const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);			const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
				arsenmUnsubmitted Done Reply Inline Actions fdiv.fast doesn't care about reassociation arsenm: fdiv.fast doesn't care about reassociation
				cfangAuthorUnsubmitted Done Reply Inline Actions You are right. This is just the optimization priority issue. If we can reassociate fdiv, x/y -> x * rcp(y) is faster than fdiv.fast so we don't do fdiv.fast. cfang: You are right. This is just the optimization priority issue. If we can reassociate fdiv, x/y…
				arsenmUnsubmitted Not Done Reply Inline Actions The comment and variable name are misleading, as no reassociation is going on here. This needs an explanation here arsenm: The comment and variable name are misleading, as no reassociation is going on here. This needs an…
				cfangAuthorUnsubmitted Done Reply Inline Actions I am going to write an explanation here. But I am confused about fdiv.fast intrinsic: 1.0/x -> fdiv.fast (1.0, x) when denormals are supported. Because I think does not support fdiv.fast. cfang: I am going to write an explanation here. But I am confused about fdiv.fast intrinsic: 1.0/x ->…
				arsenmUnsubmitted Not Done Reply Inline Actions I'm not sure what the question is. fdiv.fast is used depending on whether the denormal mode needs to be switched or not, and is separate from rcp. If we can use rcp, it's preferable to fdiv.fast arsenm: I'm not sure what the question is. fdiv.fast is used depending on whether the denormal mode…
	MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);			MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
				arsenmUnsubmitted Not Done Reply Inline Actions You don't need this anymore with getFPAccuracy arsenm: You don't need this anymore with getFPAccuracy
				cfangAuthorUnsubmitted Done Reply Inline Actions Do you think we will no longer need fpmath metadata after this point? in fdiv.fast intrinsic or the fdiv itself which may not be changed (especially in the vector cases). cfang: Do you think we will no longer need fpmath metadata after this point? in fdiv.fast intrinsic or…
				arsenmUnsubmitted Not Done Reply Inline Actions getFPAccuracy already did the only check you needed for it, so this should be a dead variable arsenm: getFPAccuracy already did the only check you needed for it, so this should be a dead variable
	const bool NeedHighAccuracy = !FPMath \|\| FPOp->getFPAccuracy() < 2.5f;			const float ReqdAccuracy = FPOp->getFPAccuracy();

	FastMathFlags FMF = FPOp->getFastMathFlags();			FastMathFlags FMF = FPOp->getFastMathFlags();
	// Determine whether it is ok to use rcp based on unsafe-fp-math,			const bool CanReassociateFDiv = HasUnsafeFPMath \|\| FMF.allowReciprocal();
	// fast math flags, denormals and accuracy request.
	const bool FastUnsafeRcpLegal = HasUnsafeFPMath \|\| FMF.isFast() \|\|
	(FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
	\|\| FMF.approxFunc()));

	// Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.			// rcp_f16 is accurate for !fpmath >= 1.0ulp.
	const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&			// rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
	!FastUnsafeRcpLegal;			// rcp_f64 is never accurate.
				const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) \|\|
				(Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

	IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));			IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
	Builder.setFastMathFlags(FMF);			Builder.setFastMathFlags(FMF);
	Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());			Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

	Value *Num = FDiv.getOperand(0);			Value *Num = FDiv.getOperand(0);
	Value *Den = FDiv.getOperand(1);			Value *Den = FDiv.getOperand(1);

	Value *NewFDiv = nullptr;			Value *NewFDiv = nullptr;
	if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {			if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
	NewFDiv = UndefValue::get(VT);			NewFDiv = UndefValue::get(VT);

	// FIXME: Doesn't do the right thing for cases where the vector is partially			// FIXME: Doesn't do the right thing for cases where the vector is partially
	// constant. This works when the scalarizer pass is run first.			// constant. This works when the scalarizer pass is run first.
	for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {			for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
	Value *NumEltI = Builder.CreateExtractElement(Num, I);			Value *NumEltI = Builder.CreateExtractElement(Num, I);
	Value *DenEltI = Builder.CreateExtractElement(Den, I);			Value *DenEltI = Builder.CreateExtractElement(Den, I);
	Value *NewElt = nullptr;			// Try rcp first.
	if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,			Value *NewElt = lowerUsingRcp(NumEltI, DenEltI, CanReassociateFDiv,
	HasFP32Denormals)) {			RcpIsAccurate, Builder, FPMath, Mod);
	Function *Decl =			if (!NewElt) // Try fdiv.fast.
	Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);			NewElt = lowerUsingFDivFast(NumEltI, DenEltI, ReqdAccuracy,
	NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);			HasFP32Denormals, Builder, FPMath, Mod);
	}			if (!NewElt) // Keep the original.
	if (!NewElt) // Try rcp.
	NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
	FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
	if (!NewElt)
	NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);			NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);

	NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);			NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
	}			}
	} else { // Scalar.			} else { // Scalar FDiv.
	if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,			// Try rcp first.
	HasFP32Denormals)) {			NewFDiv = lowerUsingRcp(Num, Den, CanReassociateFDiv, RcpIsAccurate,
	Function *Decl =			Builder, FPMath, Mod);
	Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);			if (!NewFDiv) { // Try fdiv.fast.
	NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);			NewFDiv = lowerUsingFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
	}			Builder, FPMath, Mod);
	if (!NewFDiv) { // Try rcp.
	NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
	Mod, HasFP32Denormals, NeedHighAccuracy);
	}			}
	}			}

	if (NewFDiv) {			if (NewFDiv) {
	FDiv.replaceAllUsesWith(NewFDiv);			FDiv.replaceAllUsesWith(NewFDiv);
	NewFDiv->takeName(&FDiv);			NewFDiv->takeName(&FDiv);
	FDiv.eraseFromParent();			FDiv.eraseFromParent();
	}			}
	Show All 12 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

	Show All 12 Lines
	SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,			SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
	SelectionDAG &DAG) const {			SelectionDAG &DAG) const {
	SDLoc SL(Op);			SDLoc SL(Op);
	SDValue LHS = Op.getOperand(0);			SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);			SDValue RHS = Op.getOperand(1);
	EVT VT = Op.getValueType();			EVT VT = Op.getValueType();
	const SDNodeFlags Flags = Op->getFlags();			const SDNodeFlags Flags = Op->getFlags();

	bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath \|\|			bool CanReassociateFDiv = DAG.getTarget().Options.UnsafeFPMath \|\|
	(Flags.hasAllowReciprocal() &&			Flags.hasAllowReciprocal();
	((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) \|\|
	VT == MVT::f16 \|\|			// Without !fpmath accuracy information, we can't do more because we don't
	Flags.hasApproximateFuncs()));			// know exactly whether rcp is accurate enough to meet !fpmath requirement.
				if (!CanReassociateFDiv)
	// Do rcp optimization only when fast unsafe rcp is legal here.
	// NOTE: We already performed RCP optimization to insert intrinsics in
	// AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to
	// rcp optimization.
	// However, there are cases like FREM, which is expended into a sequence
	// of instructions including FDIV, which may expose new opportunities.
	if (!FastUnsafeRcpLegal)
	return SDValue();			return SDValue();

	if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {			if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
	if (CLHS->isExactlyValue(1.0)) {			if (CLHS->isExactlyValue(1.0)) {
	// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to			// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
	// the CI documentation has a worst case error of 1 ulp.			// the CI documentation has a worst case error of 1 ulp.
	// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to			// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
	// use it as long as we aren't trying to use denormals.			// use it as long as we aren't trying to use denormals.
	Show All 12 Lines

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

	Show All 12 Lines

	ret void			ret void
	}			}

	; CHECK-LABEL: @rcp_fdiv_fpmath(			; CHECK-LABEL: @rcp_fdiv_fpmath(
	; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}			; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
	; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x)			; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x)
	; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1			; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
	; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x			; CHECK: %arcp.no.md = call arcp float @llvm.amdgcn.rcp.f32(float %x)
	; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x)			; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x)
	; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x)			; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x)
	; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x)			; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x)
	define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {			define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
	%no.md = fdiv float 1.0, %x			%no.md = fdiv float 1.0, %x
	store volatile float %no.md, float addrspace(1)* %out			store volatile float %no.md, float addrspace(1)* %out

	%md.25ulp = fdiv float 1.0, %x, !fpmath !0			%md.25ulp = fdiv float 1.0, %x, !fpmath !0
	Show All 12 Lines
	store volatile float %fast.no.md, float addrspace(1)* %out			store volatile float %fast.no.md, float addrspace(1)* %out

	%fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0			%fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
	store volatile float %fast.25ulp, float addrspace(1)* %out			store volatile float %fast.25ulp, float addrspace(1)* %out

	ret void			ret void
	}			}

	; CHECK-LABEL: @rcp_fdiv_arcp_denormal(
	; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0
	; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2
	; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
	; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
	define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 {

	%arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0
	store volatile float %arcp.low.accuracy, float addrspace(1)* %out

	%arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2
	store volatile float %arcp.high.accuracy, float addrspace(1)* %out

	%arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0
	store volatile float %arcp.low.afn, float addrspace(1)* %out

	%arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2
	store volatile float %arcp.high.afn, float addrspace(1)* %out

	ret void
	}

	; CHECK-LABEL: @fdiv_fpmath_vector(			; CHECK-LABEL: @fdiv_fpmath_vector(
	; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0			; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
	; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0			; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
	; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float %[[NO_A0]], %[[NO_B0]]			; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float %[[NO_A0]], %[[NO_B0]]
	; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0			; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0
	; CHECK: %[[NO_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1			; CHECK: %[[NO_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
	; CHECK: %[[NO_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1			; CHECK: %[[NO_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
	; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float %[[NO_A1]], %[[NO_B1]]			; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float %[[NO_A1]], %[[NO_B1]]
	Show All 24 Lines
	; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1			; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1
	; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0			; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
	; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1			; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1			; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1
	; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1			; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
	; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out			; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

	; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0			; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
	; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]			; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO0]])
	; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0			; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
	; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1			; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]]			; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO1]])
	; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1			; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
	; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out			; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

	; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0			; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
	; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])			; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
	; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0			; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0
	; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1			; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])			; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])
	Show All 24 Lines
	; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]]			; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]]
	; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0			; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0
	; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1			; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 2.000000e+00, %[[NO1]]			; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 2.000000e+00, %[[NO1]]
	; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1			; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
	; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out			; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

	; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0			; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
	; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]			; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO0]])
	; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0			; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
	; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1			; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]]			; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO1]])
	; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1			; CHECK: %[[ARCP_NO_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_NO_FDIV1]]
				; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_MUL1]], i64 1
	; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out			; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

	; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0			; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
	; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])			; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
	; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0			; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0
	; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1			; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
	; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])			; CHECK: %[[FAST_NO_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])
	; CHECK: %[[FAST_NO_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_NO_RCP1]]			; CHECK: %[[FAST_NO_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_NO_RCP1]]
	Show All 24 Lines
	; CHECK-LABEL: @fdiv_fpmath_f32_denormals(			; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
	; CHECK: %no.md = fdiv float %a, %b{{$}}			; CHECK: %no.md = fdiv float %a, %b{{$}}
	; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1			; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
	; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2			; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
	; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0			; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
	; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3			; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
	; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)			; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
	; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0			; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0
	; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0			; CHECK: %[[RCP_ARCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b)
				; CHECK: %arcp.md.25ulp = fmul arcp float %a, %[[RCP_ARCP]], !fpmath !0
	define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {			define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
	%no.md = fdiv float %a, %b			%no.md = fdiv float %a, %b
	store volatile float %no.md, float addrspace(1)* %out			store volatile float %no.md, float addrspace(1)* %out

	%md.half.ulp = fdiv float %a, %b, !fpmath !1			%md.half.ulp = fdiv float %a, %b, !fpmath !1
	store volatile float %md.half.ulp, float addrspace(1)* %out			store volatile float %md.half.ulp, float addrspace(1)* %out

	%md.1ulp = fdiv float %a, %b, !fpmath !2			%md.1ulp = fdiv float %a, %b, !fpmath !2
	Show All 12 Lines

llvm/test/CodeGen/AMDGPU/fdiv.f16.ll

	Show All 12 Lines
	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
	define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {	define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
	entry:	entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()	%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%tid.ext = sext i32 %tid to i64	%tid.ext = sext i32 %tid to i64
	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
	%b.val = load volatile half, half addrspace(1)* %gep.b	%b.val = load volatile half, half addrspace(1)* %gep.b
	%r.val = fdiv half 1.0, %b.val	%r.val = fdiv half 1.0, %b.val, !fpmath !0
	store half %r.val, half addrspace(1)* %gep.r	store half %r.val, half addrspace(1)* %gep.r
	ret void	ret void
	}	}

	; GCN-LABEL: {{^}}v_rcp_f16_abs:	; GCN-LABEL: {{^}}v_rcp_f16_abs:
	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]
	; GFX8_9_10-NOT: [[VAL]]	; GFX8_9_10-NOT: [[VAL]]
	; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], \|[[VAL]]\|	; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], \|[[VAL]]\|
	; GFX8_9_10-NOT: [[RESULT]]	; GFX8_9_10-NOT: [[RESULT]]
	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
	define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {	define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
	entry:	entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()	%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%tid.ext = sext i32 %tid to i64	%tid.ext = sext i32 %tid to i64
	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
	%b.val = load volatile half, half addrspace(1)* %gep.b	%b.val = load volatile half, half addrspace(1)* %gep.b
	%b.abs = call half @llvm.fabs.f16(half %b.val)	%b.abs = call half @llvm.fabs.f16(half %b.val)
	%r.val = fdiv half 1.0, %b.abs	%r.val = fdiv half 1.0, %b.abs, !fpmath !0
		store half %r.val, half addrspace(1)* %gep.r
		ret void
		}

		; 1/b cannot be lowered to rcp_f16(b) when the accuracy required by !fpmath is tighter than 1ulp.

		; GCN-LABEL: {{^}}reciprocal_f16_rounded:
		; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
		; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]]
		; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]]
		; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]]
		; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0
		; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
		define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
		entry:
		%tid = call i32 @llvm.amdgcn.workitem.id.x()
		%tid.ext = sext i32 %tid to i64
		%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
		%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
		%b.val = load volatile half, half addrspace(1)* %gep.b
		%r.val = fdiv half 1.0, %b.val
	store half %r.val, half addrspace(1)* %gep.r	store half %r.val, half addrspace(1)* %gep.r
	ret void	ret void
	}	}

	; GCN-LABEL: {{^}}v_rcp_f16_arcp:	; GCN-LABEL: {{^}}v_rcp_f16_arcp:
	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]
	; GFX8_9_10-NOT: [[VAL]]	; GFX8_9_10-NOT: [[VAL]]
	; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]	; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
	; GFX8_9_10-NOT: [[RESULT]]	; GFX8_9_10-NOT: [[RESULT]]
	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
	define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {	define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
	entry:	entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()	%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%tid.ext = sext i32 %tid to i64	%tid.ext = sext i32 %tid to i64
	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
	%b.val = load volatile half, half addrspace(1)* %gep.b	%b.val = load volatile half, half addrspace(1)* %gep.b
	%r.val = fdiv arcp half 1.0, %b.val	%r.val = fdiv arcp half 1.0, %b.val, !fpmath !0
	store half %r.val, half addrspace(1)* %gep.r	store half %r.val, half addrspace(1)* %gep.r
	ret void	ret void
	}	}

	; GCN-LABEL: {{^}}v_rcp_f16_neg:	; GCN-LABEL: {{^}}v_rcp_f16_neg:
	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]
	; GFX8_9_10-NOT: [[VAL]]	; GFX8_9_10-NOT: [[VAL]]
	; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]	; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
	; GFX8_9_10-NOT: [[RESULT]]	; GFX8_9_10-NOT: [[RESULT]]
	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
	define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {	define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
	entry:	entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()	%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%tid.ext = sext i32 %tid to i64	%tid.ext = sext i32 %tid to i64
	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
	%b.val = load volatile half, half addrspace(1)* %gep.b	%b.val = load volatile half, half addrspace(1)* %gep.b
	%r.val = fdiv half -1.0, %b.val	%r.val = fdiv half -1.0, %b.val, !fpmath !0
	store half %r.val, half addrspace(1)* %gep.r	store half %r.val, half addrspace(1)* %gep.r
	ret void	ret void
	}	}

	; GCN-LABEL: {{^}}v_rsq_f16:	; GCN-LABEL: {{^}}v_rsq_f16:
	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]
	; GFX8_9_10-NOT: [[VAL]]	; GFX8_9_10-NOT: [[VAL]]
	; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]	; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
	; GFX8_9_10-NOT: [[RESULT]]	; GFX8_9_10-NOT: [[RESULT]]
	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
	define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {	define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
	entry:	entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()	%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%tid.ext = sext i32 %tid to i64	%tid.ext = sext i32 %tid to i64
	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
	%b.val = load volatile half, half addrspace(1)* %gep.b	%b.val = load volatile half, half addrspace(1)* %gep.b
	%b.sqrt = call half @llvm.sqrt.f16(half %b.val)	%b.sqrt = call half @llvm.sqrt.f16(half %b.val)
	%r.val = fdiv half 1.0, %b.sqrt	%r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
	store half %r.val, half addrspace(1)* %gep.r	store half %r.val, half addrspace(1)* %gep.r
	ret void	ret void
	}	}

	; GCN-LABEL: {{^}}v_rsq_f16_neg:	; GCN-LABEL: {{^}}v_rsq_f16_neg:
	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[VAL:v[0-9]+]]
	; GFX8_9_10-NOT: [[VAL]]	; GFX8_9_10-NOT: [[VAL]]
	; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]	; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
	; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]	; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
	; GFX8_9_10-NOT: [[RESULT]]	; GFX8_9_10-NOT: [[RESULT]]
	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]	; GFX8_9_10: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
	define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {	define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
	entry:	entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()	%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%tid.ext = sext i32 %tid to i64	%tid.ext = sext i32 %tid to i64
	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext	%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext	%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
	%b.val = load volatile half, half addrspace(1)* %gep.b	%b.val = load volatile half, half addrspace(1)* %gep.b
	%b.sqrt = call half @llvm.sqrt.f16(half %b.val)	%b.sqrt = call half @llvm.sqrt.f16(half %b.val)
	%r.val = fdiv half -1.0, %b.sqrt	%r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
	store half %r.val, half addrspace(1)* %gep.r	store half %r.val, half addrspace(1)* %gep.r
	ret void	ret void
	}	}

	; GCN-LABEL: {{^}}v_fdiv_f16_arcp:	; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
	; GFX8_9_10: {{flat\|global}}_load_ushort [[LHS:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[LHS:v[0-9]+]]
	; GFX8_9_10: {{flat\|global}}_load_ushort [[RHS:v[0-9]+]]	; GFX8_9_10: {{flat\|global}}_load_ushort [[RHS:v[0-9]+]]

	Show All 24 Lines

	declare i32 @llvm.amdgcn.workitem.id.x() #1	declare i32 @llvm.amdgcn.workitem.id.x() #1
	declare half @llvm.sqrt.f16(half) #1	declare half @llvm.sqrt.f16(half) #1
	declare half @llvm.fabs.f16(half) #1	declare half @llvm.fabs.f16(half) #1

	attributes #0 = { nounwind }	attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }	attributes #1 = { nounwind readnone }
	attributes #2 = { nounwind "unsafe-fp-math"="true" }	attributes #2 = { nounwind "unsafe-fp-math"="true" }

		!0 = !{float 2.500000e+00}
Context not available.

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 241807

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

llvm/test/CodeGen/AMDGPU/fdiv.f16.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 241807

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll

llvm/test/CodeGen/AMDGPU/fdiv.f16.ll

AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare
ClosedPublic