This is an archive of the discontinued LLVM Phabricator instance.

[Inliner] Restrict soft-float inlining penalty.
ClosedPublic

Authored by efriedma on Dec 21 2017, 2:42 PM.

Download Raw Diff

Details

Reviewers

haicheng
chandlerc
mcrosier
echristo

Commits

rG39ed9a602bff: [Inliner] Restrict soft-float inlining penalty.
rL321332: [Inliner] Restrict soft-float inlining penalty.

Summary

The penalty is currently getting applied in a bunch of places where it doesn't make sense, like bitcasts (which are free) and calls (which were getting the call penalty applied twice). Instead, just apply the penalty to binary operators and floating-point casts.

While I'm here, also fix getFPOpCost() to do the right thing in more cases, so we don't have to dig into function attributes.

(Not sure if I should also apply this to fcmp instructions.)

Diff Detail

Repository: rL LLVM

Event Timeline

efriedma created this revision. · Dec 21 2017, 2:42 PM

Herald added a subscriber: eraman. · View Herald Transcript · Dec 21 2017, 2:42 PM

echristo added inline comments. · Dec 21 2017, 3:57 PM

lib/Analysis/InlineCost.cpp
1103 ↗	(On Diff #127945)	I realize you're just moving this code, but we might want to make this a TTI backend call for the function since there's also the subtarget feature on some targets as well. Would it make sense for this to be a part of getFPOpCost instead?

Fixed getFPOpCost().

Herald added a subscriber: javed.absar. · View Herald Transcript · Dec 21 2017, 5:42 PM

efriedma added inline comments. · Dec 21 2017, 5:55 PM

lib/Analysis/InlineCost.cpp
1103 ↗	(On Diff #127945)	I think this code was originally written before we had per-function TTI... but we should take advantage of it now. Fixed the default implementation of getFPOpCost to do the right thing, and removed the explicit check for "use-soft-float".

LGTM, thanks!

This revision is now accepted and ready to land. · Dec 21 2017, 5:58 PM

Closed by commit rL321332: [Inliner] Restrict soft-float inlining penalty. (authored by efriedma). · Explain Why · Dec 21 2017, 6:08 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

CodeGen/

BasicTTIImpl.h

10 lines

lib/

Analysis/

InlineCost.cpp

34 lines

Target/

ARM/

ARMTargetTransformInfo.h

2 lines

ARMTargetTransformInfo.cpp

19 lines

test/

Transforms/

Inline/

ARM/

inline-fp.ll

113 lines

Diff 127976

llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 296 Lines • ▼ Show 20 Lines	return TLI->isTypeLegal(VT) &&
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);		TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}		}

bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {		bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
return true;		return true;
}		}

unsigned getFPOpCost(Type *Ty) {		unsigned getFPOpCost(Type *Ty) {
// By default, FP instructions are no more expensive since they are		// Check whether FADD is available, as a proxy for floating-point in
// implemented in HW. Target specific TTI can override this.		// general.
		const TargetLoweringBase *TLI = getTLI();
		EVT VT = TLI->getValueType(DL, Ty);
		if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
return TargetTransformInfo::TCC_Basic;		return TargetTransformInfo::TCC_Basic;
		return TargetTransformInfo::TCC_Expensive;
}		}

unsigned getOperationCost(unsigned Opcode, Type Ty, Type OpTy) {		unsigned getOperationCost(unsigned Opcode, Type Ty, Type OpTy) {
const TargetLoweringBase *TLI = getTLI();		const TargetLoweringBase *TLI = getTLI();
switch (Opcode) {		switch (Opcode) {
default: break;		default: break;
case Instruction::Trunc:		case Instruction::Trunc:
if (TLI->isTruncateFree(OpTy, Ty))		if (TLI->isTruncateFree(OpTy, Ty))
▲ Show 20 Lines • Show All 987 Lines • Show Last 20 Lines

llvm/trunk/lib/Analysis/InlineCost.cpp

Show First 20 Lines • Show All 695 Lines • ▼ Show 20 Lines	bool CallAnalyzer::visitCastInst(CastInst &I) {
if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {		if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());		return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
}))		}))
return true;		return true;

// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.		// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
disableSROA(I.getOperand(0));		disableSROA(I.getOperand(0));

		// If this is a floating-point cast, and the target says this operation
		// is expensive, this may eventually become a library call. Treat the cost
		// as such.
		switch (I.getOpcode()) {
		case Instruction::FPTrunc:
		case Instruction::FPExt:
		case Instruction::UIToFP:
		case Instruction::SIToFP:
		case Instruction::FPToUI:
		case Instruction::FPToSI:
		if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
		Cost += InlineConstants::CallPenalty;
		default:
		break;
		}

return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);		return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}		}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {		bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
Value *Operand = I.getOperand(0);		Value *Operand = I.getOperand(0);
if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {		if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
return ConstantFoldInstOperands(&I, COps[0], DL);		return ConstantFoldInstOperands(&I, COps[0], DL);
}))		}))
▲ Show 20 Lines • Show All 362 Lines • ▼ Show 20 Lines	bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {

if (simplifyInstruction(I, Evaluate))		if (simplifyInstruction(I, Evaluate))
return true;		return true;

// Disable any SROA on arguments to arbitrary, unsimplified binary operators.		// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
disableSROA(LHS);		disableSROA(LHS);
disableSROA(RHS);		disableSROA(RHS);

		// If the instruction is floating point, and the target says this operation
		// is expensive, this may eventually become a library call. Treat the cost
		// as such.
		if (I.getType()->isFloatingPointTy() &&
		TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
		Cost += InlineConstants::CallPenalty;

return false;		return false;
}		}

bool CallAnalyzer::visitLoad(LoadInst &I) {		bool CallAnalyzer::visitLoad(LoadInst &I) {
Value *SROAArg;		Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;		DenseMap<Value *, int>::iterator CostIt;
if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {		if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
if (I.isSimple()) {		if (I.isSimple()) {
▲ Show 20 Lines • Show All 453 Lines • ▼ Show 20 Lines	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
// Skip ephemeral values.		// Skip ephemeral values.
if (EphValues.count(&*I))		if (EphValues.count(&*I))
continue;		continue;

++NumInstructions;		++NumInstructions;
if (isa<ExtractElementInst>(I) \|\| I->getType()->isVectorTy())		if (isa<ExtractElementInst>(I) \|\| I->getType()->isVectorTy())
++NumVectorInstructions;		++NumVectorInstructions;

// If the instruction is floating point, and the target says this operation
// is expensive or the function has the "use-soft-float" attribute, this may
// eventually become a library call. Treat the cost as such.
if (I->getType()->isFloatingPointTy()) {
// If the function has the "use-soft-float" attribute, mark it as
// expensive.
if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive \|\|
(F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
Cost += InlineConstants::CallPenalty;
}

// If the instruction simplified to a constant, there is no cost to this		// If the instruction simplified to a constant, there is no cost to this
// instruction. Visit the instructions using our InstVisitor to account for		// instruction. Visit the instructions using our InstVisitor to account for
// all of the per-instruction logic. The visit tree returns true if we		// all of the per-instruction logic. The visit tree returns true if we
// consumed the instruction in any way, and false if the instruction's base		// consumed the instruction in any way, and false if the instruction's base
// cost should count against inlining.		// cost should count against inlining.
if (Base::visit(&*I))		if (Base::visit(&*I))
++NumInstructionsSimplified;		++NumInstructionsSimplified;
else		else
▲ Show 20 Lines • Show All 552 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h

Show First 20 Lines • Show All 150 Lines • ▼ Show 20 Lines	public:
int getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy,		int getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy,
const Instruction *I = nullptr);		const Instruction *I = nullptr);

int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);		int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);

int getAddressComputationCost(Type Val, ScalarEvolution SE,		int getAddressComputationCost(Type Val, ScalarEvolution SE,
const SCEV *Ptr);		const SCEV *Ptr);

int getFPOpCost(Type *Ty);

int getArithmeticInstrCost(		int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,		unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,		TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,		TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,		TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,		TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value > Args = ArrayRef<const Value >());		ArrayRef<const Value > Args = ArrayRef<const Value >());

Show All 25 Lines

llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp

Show First 20 Lines • Show All 388 Lines • ▼ Show 20 Lines	if (Ty->isVectorTy() && SE &&
!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))		!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
return NumVectorInstToHideOverhead;		return NumVectorInstToHideOverhead;

// In many cases the address computation is not merged into the instruction		// In many cases the address computation is not merged into the instruction
// addressing mode.		// addressing mode.
return 1;		return 1;
}		}

// Returns the TTI cost class for a floating-point operation on type Ty.
// The result is TCC_Basic for float, and for double unless isFPOnlySP(),
// when the subtarget reports hasVFP2() and is not Thumb1-only. Every other
// case, including any type other than float or double, is TCC_Expensive.
int ARMTTIImpl::getFPOpCost(Type *Ty) {
// Use similar logic that's in ARMISelLowering:
// Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
// to VFP.

if (ST->hasVFP2() && !ST->isThumb1Only()) {
// Single precision is basic-cost whenever the VFP2 check above passes.
if (Ty->isFloatTy()) {
return TargetTransformInfo::TCC_Basic;
}

// Double precision is basic-cost only when the subtarget is not
// single-precision-only.
if (Ty->isDoubleTy()) {
return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
TargetTransformInfo::TCC_Basic;
}
}

// Either the VFP2 check failed or Ty is neither float nor double.
return TargetTransformInfo::TCC_Expensive;
}

int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,		int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {		Type *SubTp) {
// We only handle costs of reverse and alternate shuffles for now.		// We only handle costs of reverse and alternate shuffles for now.
if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)		if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);		return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

if (Kind == TTI::SK_Reverse) {		if (Kind == TTI::SK_Reverse) {
static const CostTblEntry NEONShuffleTbl[] = {		static const CostTblEntry NEONShuffleTbl[] = {
▲ Show 20 Lines • Show All 222 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/Inline/ARM/inline-fp.ll

				; RUN: opt -S -inline -mtriple=arm-eabi -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 \| FileCheck %s -check-prefix=NOFP
				; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2 -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 \| FileCheck %s -check-prefix=FULLFP
				; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2,+fp-only-sp -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 \| FileCheck %s -check-prefix=SINGLEFP
				; Make sure that soft float implementations are calculated as being more expensive
				; to the inliner.

				; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
				; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
				; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
				; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
				; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
				; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
				; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
				; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)

				; FULLFP-DAG: single inlined into test_single with cost=0 (threshold=75)
				; FULLFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
				; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
				; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
				; FULLFP-DAG: double inlined into test_double with cost=0 (threshold=75)
				; FULLFP-DAG: double inlined into test_double with cost=-15000 (threshold=75)
				; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
				; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)

				; SINGLEFP-DAG: single inlined into test_single with cost=0 (threshold=75)
				; SINGLEFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
				; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
				; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
				; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
				; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
				; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
				; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)

				define i32 @test_single(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
				%call = call float @single(i32 %a, i8 zeroext %b)
				%call2 = call float @single(i32 %c, i8 zeroext %d)
				ret i32 0
				}

				define i32 @test_single_cheap(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
				%call = call float @single_cheap(i32 %a, i8 zeroext %b)
				%call2 = call float @single_cheap(i32 %c, i8 zeroext %d)
				ret i32 0
				}

				define i32 @test_double(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
				%call = call double @double(i32 %a, i8 zeroext %b)
				%call2 = call double @double(i32 %c, i8 zeroext %d)
				ret i32 0
				}

				define i32 @test_single_force_soft(i32 %a, i8 %b, i32 %c, i8 %d) #1 {
				%call = call float @single_force_soft(i32 %a, i8 zeroext %b) #1
				%call2 = call float @single_force_soft(i32 %c, i8 zeroext %d) #1
				ret i32 0
				}

				define internal float @single(i32 %response, i8 zeroext %value1) #0 {
				entry:
				%conv = zext i8 %value1 to i32
				%sub = add nsw i32 %conv, -1
				%conv1 = sitofp i32 %sub to float
				%0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
				%mul = fmul float %0, 2.620000e+03
				%conv2 = sitofp i32 %response to float
				%sub3 = fsub float %conv2, %mul
				%div = fdiv float %sub3, %mul
				ret float %div
				}

				define internal float @single_cheap(i32 %response, i8 zeroext %value1) #0 {
				entry:
				%conv = zext i8 %value1 to i32
				%sub = add nsw i32 %conv, -1
				%conv1 = bitcast i32 %sub to float
				%conv2 = bitcast i32 %response to float
				%0 = tail call float @llvm.pow.f32(float %conv2, float %conv1)
				%1 = tail call float @llvm.pow.f32(float %0, float %0)
				%2 = tail call float @llvm.pow.f32(float %1, float %1)
				ret float %2
				}

				define internal double @double(i32 %response, i8 zeroext %value1) #0 {
				entry:
				%conv = zext i8 %value1 to i32
				%sub = add nsw i32 %conv, -1
				%conv1 = sitofp i32 %sub to double
				%0 = tail call double @llvm.pow.f64(double 0x3FF028F5C0000000, double %conv1)
				%mul = fmul double %0, 2.620000e+03
				%conv2 = sitofp i32 %response to double
				%sub3 = fsub double %conv2, %mul
				%div = fdiv double %sub3, %mul
				ret double %div
				}

				define internal float @single_force_soft(i32 %response, i8 zeroext %value1) #1 {
				entry:
				%conv = zext i8 %value1 to i32
				%sub = add nsw i32 %conv, -1
				%conv1 = sitofp i32 %sub to float
				%0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
				%mul = fmul float %0, 2.620000e+03
				%conv2 = sitofp i32 %response to float
				%sub3 = fsub float %conv2, %mul
				%div = fdiv float %sub3, %mul
				ret float %div
				}

				declare float @llvm.pow.f32(float, float) optsize minsize
				declare double @llvm.pow.f64(double, double) optsize minsize

				attributes #0 = { optsize }
				attributes #1 = { optsize "use-soft-float"="true" "target-features"="+soft-float" }