diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -338,13 +338,16 @@ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale, unsigned AddrSpace) { + int64_t Scale, + unsigned AddrSpace) const { TargetLoweringBase::AddrMode AM; AM.BaseGV = BaseGV; AM.BaseOffs = BaseOffset; AM.HasBaseReg = HasBaseReg; AM.Scale = Scale; - return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) + return 0; + return -1; } bool isTruncateFree(Type *Ty1, Type *Ty2) { @@ -363,7 +366,7 @@ } InstructionCost getRegUsageForType(Type *Ty) { - InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; + InstructionCost Val = getTypeLegalizationCost(Ty).first; assert(Val >= 0 && "Negative cost!"); return Val; } @@ -752,6 +755,41 @@ return Cost; } + /// Estimate the cost of type-legalization and the legalized type. + std::pair getTypeLegalizationCost(Type *Ty) const { + LLVMContext &C = Ty->getContext(); + EVT MTy = getTLI()->getValueType(DL, Ty); + + InstructionCost Cost = 1; + // We keep legalizing the type until we find a legal kind. We assume that + // the only operation that costs anything is the split. After splitting + // we need to handle two types. + while (true) { + TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); + + if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { + // Ensure we return a sensible simple VT here, since many callers of + // this function require it. + MVT VT = MTy.isSimple() ? 
MTy.getSimpleVT() : MVT::i64; + return std::make_pair(InstructionCost::getInvalid(), VT); + } + + if (LK.first == TargetLoweringBase::TypeLegal) + return std::make_pair(Cost, MTy.getSimpleVT()); + + if (LK.first == TargetLoweringBase::TypeSplitVector || + LK.first == TargetLoweringBase::TypeExpandInteger) + Cost *= 2; + + // Do not loop with f128 type. + if (MTy == LK.second) + return std::make_pair(Cost, MTy.getSimpleVT()); + + // Keep legalizing the type. + MTy = LK.second; + } + } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } InstructionCost getArithmeticInstrCost( @@ -774,7 +812,7 @@ Opd1PropInfo, Opd2PropInfo, Args, CxtI); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); bool IsFloat = Ty->isFPOrFPVectorTy(); // Assume that floating point arithmetic operations cost twice as much as @@ -907,10 +945,8 @@ const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair SrcLT = - TLI->getTypeLegalizationCost(DL, Src); - std::pair DstLT = - TLI->getTypeLegalizationCost(DL, Dst); + std::pair SrcLT = getTypeLegalizationCost(Src); + std::pair DstLT = getTypeLegalizationCost(Dst); TypeSize SrcSize = SrcLT.second.getSizeInBits(); TypeSize DstSize = DstLT.second.getSizeInBits(); @@ -1005,7 +1041,7 @@ // If we are legalizing by splitting, query the concrete TTI for the cost // of casting the original vector twice. We also need to factor in the // cost of the split itself. Count that as 1, to be consistent with - // TLI->getTypeLegalizationCost(). + // getTypeLegalizationCost(). 
bool SplitSrc = TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == TargetLowering::TypeSplitVector; @@ -1086,8 +1122,7 @@ if (CondTy->isVectorTy()) ISD = ISD::VSELECT; } - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); if (!(ValTy->isVectorTy() && !LT.second.isVector()) && !TLI->isOperationExpand(ISD, LT.second)) { @@ -1117,10 +1152,7 @@ InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); - - return LT.first; + return getRegUsageForType(Val->getScalarType()); } InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, @@ -1164,8 +1196,7 @@ // Assume types, such as structs, are expensive. if (getTLI()->getValueType(DL, Src, true) == MVT::Other) return 4; - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); // Assuming that all loads of legal types cost 1. InstructionCost Cost = LT.first; @@ -1239,7 +1270,7 @@ // Legalize the vector type, and get the legalized and unlegalized type // sizes. - MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + MVT VecTyLT = getTypeLegalizationCost(VecTy).second; unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); unsigned VecTyLTSize = VecTyLT.getStoreSize(); @@ -1918,8 +1949,7 @@ } const TargetLoweringBase *TLI = getTLI(); - std::pair LT = - TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = getTypeLegalizationCost(RetTy); SmallVector LegalCost; SmallVector CustomCost; @@ -2028,8 +2058,7 @@ } unsigned getNumberOfParts(Type *Tp) { - std::pair LT = - getTLI()->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); return LT.first.isValid() ? 
*LT.first.getValue() : 0; } @@ -2082,8 +2111,7 @@ unsigned NumReduxLevels = Log2_32(NumVecElts); InstructionCost ArithCost = 0; InstructionCost ShuffleCost = 0; - std::pair LT = - thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); + std::pair LT = thisT()->getTypeLegalizationCost(Ty); unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; @@ -2173,8 +2201,7 @@ } InstructionCost MinMaxCost = 0; InstructionCost ShuffleCost = 0; - std::pair LT = - thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); + std::pair LT = thisT()->getTypeLegalizationCost(Ty); unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -49,7 +49,6 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MachineValueType.h" #include #include @@ -952,6 +951,7 @@ return ValueTypeActions; } + LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; /// Return how we should legalize values of this type, either it is already /// legal (return 'Legal') or we need to promote it to a larger type (return /// 'Promote'), or we need to expand it into multiple registers of smaller @@ -1882,10 +1882,6 @@ /// Get the ISD node that corresponds to the Instruction class opcode. int InstructionOpcodeToISD(unsigned Opcode) const; - /// Estimate the cost of type-legalization and the legalized type. 
- std::pair getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const; - /// @} //===--------------------------------------------------------------------===// @@ -2415,22 +2411,6 @@ Type *Ty, unsigned AddrSpace, Instruction *I = nullptr) const; - /// Return the cost of the scaling factor used in the addressing mode - /// represented by AM for this target, for a load/store of the specified type. - /// - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - /// TODO: Handle pre/postinc as well. - /// TODO: Remove default argument - virtual InstructionCost getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS = 0) const { - // Default: assume that any scaling factor used in a legal AM is free. - if (isLegalAddressingMode(DL, AM, Ty, AS)) - return 0; - return -1; - } - /// Return true if the specified immediate is legal icmp immediate, that is /// the target has icmp instructions which can compare a register against the /// immediate without having to materialize the immediate into a register. @@ -3117,8 +3097,6 @@ ValueTypeActionImpl ValueTypeActions; private: - LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; - /// Targets can specify ISD nodes that they would like PerformDAGCombine /// callbacks for by calling setTargetDAGCombine(), which sets a bit in this /// array. diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1844,41 +1844,6 @@ llvm_unreachable("Unknown instruction type encountered!"); } -std::pair -TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const { - LLVMContext &C = Ty->getContext(); - EVT MTy = getValueType(DL, Ty); - - InstructionCost Cost = 1; - // We keep legalizing the type until we find a legal kind. 
We assume that - // the only operation that costs anything is the split. After splitting - // we need to handle two types. - while (true) { - LegalizeKind LK = getTypeConversion(C, MTy); - - if (LK.first == TypeScalarizeScalableVector) { - // Ensure we return a sensible simple VT here, since many callers of this - // function require it. - MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64; - return std::make_pair(InstructionCost::getInvalid(), VT); - } - - if (LK.first == TypeLegal) - return std::make_pair(Cost, MTy.getSimpleVT()); - - if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) - Cost *= 2; - - // Do not loop with f128 type. - if (MTy == LK.second) - return std::make_pair(Cost, MTy.getSimpleVT()); - - // Keep legalizing the type. - MTy = LK.second; - } -} - Value * TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilderBase &IRB, bool UseTLS) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -619,14 +619,6 @@ unsigned AS, Instruction *I = nullptr) const override; - /// Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - InstructionCost getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, - Type *Ty, unsigned AS) const override; - /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this method /// returns true, otherwise fmuladd is expanded to fmul + fadd. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12820,22 +12820,6 @@ return true; } -InstructionCost AArch64TargetLowering::getScalingFactorCost( - const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { - // Scaling factors are not free at all. - // Operands | Rt Latency - // ------------------------------------------- - // Rt, [Xn, Xm] | 4 - // ------------------------------------------- - // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 - // Rt, [Xn, Wm, #imm] | - if (isLegalAddressingMode(DL, AM, Ty, AS)) - // Scale represents reg2 * scale, thus account for 1 if - // it is not equal to 0 or 1. - return AM.Scale != 0 && AM.Scale != 1; - return -1; -} - bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -331,6 +331,14 @@ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, VectorType *SubTp); + /// Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. 
+ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const; /// @} }; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -233,7 +233,7 @@ case Intrinsic::smax: { static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); // v2i64 types get converted to cmp+bif hence the cost of 2 if (LT.second == MVT::v2i64) return LT.first * 2; @@ -248,7 +248,7 @@ static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); // This is a base cost of 1 for the vadd, plus 3 extract shifts if we // need to extend the type, as it uses shr(qadd(shl, shl)). unsigned Instrs = @@ -261,14 +261,14 @@ static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; })) return LT.first; break; } case Intrinsic::experimental_stepvector: { InstructionCost Cost = 1; // Cost of the `index' instruction - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); // Legalisation of illegal vectors involves an `index' instruction plus // (LT.first - 1) vector adds. 
if (LT.first > 1) { @@ -292,7 +292,7 @@ {Intrinsic::bitreverse, MVT::v1i64, 2}, {Intrinsic::bitreverse, MVT::v2i64, 2}, }; - const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); + const auto LegalisationCost = getTypeLegalizationCost(RetTy); const auto *Entry = CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); if (Entry) { @@ -318,7 +318,7 @@ {ISD::CTPOP, MVT::v8i8, 1}, {ISD::CTPOP, MVT::i32, 5}, }; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + auto LT = getTypeLegalizationCost(RetTy); MVT MTy = LT.second; if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { // Extra cost of +1 when illegal vector types are legalized by promoting @@ -1300,7 +1300,7 @@ // Legalize the destination type and ensure it can be used in a widening // operation. - auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + auto DstTyL = getTypeLegalizationCost(DstTy); unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) return false; @@ -1308,7 +1308,7 @@ // Legalize the source type and ensure it can be used in a widening // operation. auto *SrcTy = toVectorTy(Extend->getSrcTy()); - auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + auto SrcTyL = getTypeLegalizationCost(SrcTy); unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) return false; @@ -1623,7 +1623,7 @@ getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); // Legalize the types. - auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); + auto VecLT = getTypeLegalizationCost(VecTy); auto DstVT = TLI->getValueType(DL, Dst); auto SrcVT = TLI->getValueType(DL, Src); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -1678,7 +1678,7 @@ if (Index != -1U) { // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair LT = getTypeLegalizationCost(Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -1713,7 +1713,7 @@ Opd2PropInfo, Args, CxtI); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), // add in the widening overhead specified by the sub-target. Since the @@ -1883,7 +1883,7 @@ static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, MVT::v4i32, MVT::v2i64}; - auto LT = TLI->getTypeLegalizationCost(DL, ValTy); + auto LT = getTypeLegalizationCost(ValTy); if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; })) return LT.first; } @@ -1937,7 +1937,7 @@ if (useNeonVector(Src)) return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); - auto LT = TLI->getTypeLegalizationCost(DL, Src); + auto LT = getTypeLegalizationCost(Src); if (!LT.first.isValid()) return InstructionCost::getInvalid(); @@ -1962,7 +1962,7 @@ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I); auto *VT = cast(DataTy); - auto LT = TLI->getTypeLegalizationCost(DL, DataTy); + auto LT = getTypeLegalizationCost(DataTy); if (!LT.first.isValid()) return InstructionCost::getInvalid(); @@ -1999,7 +1999,7 @@ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind); - auto LT = TLI->getTypeLegalizationCost(DL, Ty); + auto LT = getTypeLegalizationCost(Ty); if (!LT.first.isValid()) return InstructionCost::getInvalid(); @@ -2344,7 +2344,7 @@ AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) return 
BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); @@ -2368,7 +2368,7 @@ InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); InstructionCost LegalizationCost = 0; if (LT.first > 1) { Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); @@ -2417,7 +2417,7 @@ if (isa(ValTy)) return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -2508,7 +2508,7 @@ { TTI::SK_Splice, MVT::nxv2f64, 1 }, }; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; EVT PromotedVT = LT.second.getScalarType() == MVT::i1 @@ -2639,7 +2639,7 @@ { TTI::SK_Reverse, MVT::nxv4i1, 1 }, { TTI::SK_Reverse, MVT::nxv2i1, 1 }, }; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; } @@ -2647,3 +2647,26 @@ return getSpliceCost(Tp, Index); return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } + +InstructionCost +AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const { + // Scaling factors are not free at all. 
+ // Operands | Rt Latency + // ------------------------------------------- + // Rt, [Xn, Xm] | 4 + // ------------------------------------------- + // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 + // Rt, [Xn, Wm, #imm] | + TargetLoweringBase::AddrMode AM; + AM.BaseGV = BaseGV; + AM.BaseOffs = BaseOffset; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Scale; + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) + // Scale represents reg2 * scale, thus account for 1 if + // it is not equal to 0 or 1. + return AM.Scale != 0 && AM.Scale != 1; + return -1; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -95,6 +95,8 @@ // quarter. This also applies to some integer operations. int get64BitInstrCost(TTI::TargetCostKind CostKind) const; + std::pair getTypeLegalizationCost(Type *Ty) const; + public: explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -521,7 +521,7 @@ const Instruction *CxtI) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); // Because we don't have any legal vector operations, but the legal types, we @@ -693,7 +693,7 @@ Type *RetTy = ICA.getReturnType(); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = getTypeLegalizationCost(RetTy); unsigned NElts = LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1; @@ -772,7 +772,7 @@ if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); return LT.first * getFullRateInstrCost(); } @@ -787,7 +787,7 @@ if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); return LT.first * getHalfRateInstrCost(CostKind); } @@ -1150,3 +1150,17 @@ : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) : getQuarterRateInstrCost(CostKind); } + +std::pair +GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { + std::pair Cost = BaseT::getTypeLegalizationCost(Ty); + auto Size = DL.getTypeSizeInBits(Ty); + // Maximum load or store can handle 8 dwords for scalar and 4 for + // vector ALU. Let's assume anything above 8 dwords is expensive + // even if legal. 
+ if (Size <= 256) + return Cost; + + Cost.first += (Size + 255) / 256; + return Cost; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -497,9 +497,6 @@ MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; - - std::pair getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12501,20 +12501,4 @@ } SmallPtrSet Visited; return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); } - -std::pair -SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const { - std::pair Cost = - TargetLoweringBase::getTypeLegalizationCost(DL, Ty); - auto Size = DL.getTypeSizeInBits(Ty); - // Maximum load or store can handle 8 dwords for scalar and 4 for - // vector ALU. Let's assume anything above 8 dwords is expensive - // even if legal. - if (Size <= 256) - return Cost; - - Cost.first += (Size + 255) / 256; - return Cost; -} diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -470,14 +470,6 @@ Type *Ty, unsigned AS, Instruction *I = nullptr) const override; - /// getScalingFactorCost - Return the cost of the scaling used in - /// addressing mode represented by AM. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, the return value must be negative. 
- InstructionCost getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS) const override; - bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// Returns true if the addressing mode representing by AM is legal diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -19043,18 +19043,6 @@ return true; } -InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, - Type *Ty, - unsigned AS) const { - if (isLegalAddressingMode(DL, AM, Ty, AS)) { - if (Subtarget->hasFPAO()) - return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster - return 0; - } - return -1; -} - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -281,6 +281,14 @@ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); + /// getScalingFactorCost - Return the cost of the scaling used in + /// addressing mode represented by AM. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, the return value must be negative. 
+ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const; + bool maybeLoweredToCall(Instruction &I); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -633,7 +633,7 @@ {ISD::FP_EXTEND, MVT::v2f32, 2}, {ISD::FP_EXTEND, MVT::v4f32, 4}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) return AdjustCost(LT.first * Entry->Cost); } @@ -900,7 +900,7 @@ // sometimes just be vmovs. Integer involve being passes to GPR registers, // causing more of a delay. std::pair LT = - getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType()); + getTypeLegalizationCost(ValTy->getScalarType()); return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1); } @@ -925,7 +925,7 @@ // - may require one or more conditional mov (including an IT), // - can't operate directly on immediates, // - require live flags, which we can't copy around easily. - InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; + InstructionCost Cost = getTypeLegalizationCost(ValTy).first; // Possible IT instruction for Thumb2, or more for Thumb1. ++Cost; @@ -1002,8 +1002,7 @@ return Entry->Cost; } - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); return LT.first; } @@ -1027,8 +1026,7 @@ I); } - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); int BaseCost = ST->getMVEVectorCostFactor(CostKind); // There are two types - the input that specifies the type of the compare // and the output vXi1 type. 
Because we don't know how the output will be @@ -1220,7 +1218,7 @@ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; @@ -1241,7 +1239,7 @@ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; @@ -1265,7 +1263,7 @@ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; @@ -1281,7 +1279,7 @@ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost * @@ -1289,7 +1287,7 @@ } if (!Mask.empty()) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); if (Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || isVREVMask(Mask, LT.second, 64))) @@ -1325,7 +1323,7 @@ } } - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (ST->hasNEON()) { const unsigned FunctionCallDivCost = 20; @@ -1464,7 +1462,7 @@ cast(Src)->getElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); return LT.first * 4; } @@ -1565,7 +1563,7 @@ unsigned NumElems = VTy->getNumElements(); unsigned EltSize = VTy->getScalarSizeInBits(); - std::pair LT = TLI->getTypeLegalizationCost(DL, DataTy); + std::pair LT = getTypeLegalizationCost(DataTy); // For now, it is assumed that for the MVE gather instructions the loads are // all effectively serialised. This means the cost is the scalar cost @@ -1661,7 +1659,7 @@ if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); static const CostTblEntry CostTblAdd[]{ {ISD::ADD, MVT::v16i8, 1}, @@ -1682,8 +1680,7 @@ EVT ResVT = TLI->getValueType(DL, ResTy); if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { - std::pair LT = - TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); // The legal cases are: // VADDV u/s 8/16/32 @@ -1728,7 +1725,7 @@ break; Type *VT = ICA.getReturnType(); - std::pair LT = TLI->getTypeLegalizationCost(DL, VT); + std::pair LT = getTypeLegalizationCost(VT); if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || LT.second == MVT::v16i8) { // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we @@ -1748,7 +1745,7 @@ break; Type *VT = ICA.getReturnType(); - std::pair LT = TLI->getTypeLegalizationCost(DL, VT); + std::pair LT = getTypeLegalizationCost(VT); if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || LT.second == MVT::v16i8) return LT.first * ST->getMVEVectorCostFactor(CostKind); @@ -1759,7 +1756,7 @@ if (!ST->hasMVEFloatOps()) break; Type *VT = ICA.getReturnType(); - std::pair LT = TLI->getTypeLegalizationCost(DL, VT); + std::pair LT = getTypeLegalizationCost(VT); if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) return LT.first 
* ST->getMVEVectorCostFactor(CostKind); break; @@ -2344,3 +2341,20 @@ return false; return true; } + +InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const { + TargetLoweringBase::AddrMode AM; + AM.BaseGV = BaseGV; + AM.BaseOffs = BaseOffset; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Scale; + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) { + if (ST->hasFPAO()) + return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster + return 0; + } + return -1; +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -145,7 +145,7 @@ TTI::TargetCostKind CostKind) { if (ICA.getID() == Intrinsic::bswap) { std::pair LT = - TLI.getTypeLegalizationCost(DL, ICA.getReturnType()); + getTypeLegalizationCost(ICA.getReturnType()); return LT.first + 2; } return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -253,7 +253,7 @@ TTI::TargetCostKind CostKind, const Instruction *I) { if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { - std::pair LT = TLI.getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); if (Opcode == Instruction::FCmp) return LT.first + FloatFactor * getTypeNumElements(ValTy); } @@ -273,7 +273,7 @@ Opd2PropInfo, Args, CxtI); if (Ty->isVectorTy()) { - std::pair LT = TLI.getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); if (LT.second.isFloatingPoint()) return LT.first + FloatFactor * getTypeNumElements(Ty); } @@ -290,10 +290,8 @@ unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; unsigned DstN = DstTy->isFPOrFPVectorTy() ? 
getTypeNumElements(DstTy) : 0; - std::pair SrcLT = - TLI.getTypeLegalizationCost(DL, SrcTy); - std::pair DstLT = - TLI.getTypeLegalizationCost(DL, DstTy); + std::pair SrcLT = getTypeLegalizationCost(SrcTy); + std::pair DstLT = getTypeLegalizationCost(DstTy); InstructionCost Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); // TODO: Allow non-throughput costs that aren't binary. diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -375,7 +375,7 @@ TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, const Instruction *CxtI) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -335,8 +335,7 @@ if (U->getType()->isVectorTy()) { // Instructions that need to be split should cost more. - std::pair LT = - TLI->getTypeLegalizationCost(DL, U->getType()); + std::pair LT = getTypeLegalizationCost(U->getType()); return LT.first * BaseT::getUserCost(U, Operands, CostKind); } @@ -965,7 +964,7 @@ if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy()) return InstructionCost(1); - std::pair LT1 = TLI->getTypeLegalizationCost(DL, Ty1); + std::pair LT1 = getTypeLegalizationCost(Ty1); // If type legalization involves splitting the vector, we don't want to // double the cost at every step - only the last step. 
if (LT1.first != 1 || !LT1.second.isVector()) @@ -976,7 +975,7 @@ return InstructionCost(1); if (Ty2) { - std::pair LT2 = TLI->getTypeLegalizationCost(DL, Ty2); + std::pair LT2 = getTypeLegalizationCost(Ty2); if (LT2.first != 1 || !LT2.second.isVector()) return InstructionCost(1); } @@ -1018,7 +1017,7 @@ return InstructionCost::getMax(); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair LT = getTypeLegalizationCost(Tp); // PPC, for both Altivec/VSX, support cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant @@ -1160,7 +1159,7 @@ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); @@ -1250,7 +1249,7 @@ "Expect a vector type for interleaved memory op"); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, VecTy); + std::pair LT = getTypeLegalizationCost(VecTy); // Firstly, the cost of load/store operation. InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), @@ -1431,8 +1430,7 @@ assert(SrcVTy && "Expected a vector type for VP memory operations"); if (hasActiveVectorLength(Opcode, Src, Alignment)) { - std::pair LT = - TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair LT = getTypeLegalizationCost(SrcVTy); InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1230,15 +1230,6 @@ bool isLegalStoreImmediate(int64_t Imm) const override; - /// Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. 
- /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - InstructionCost getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS) const override; - /// This is used to enable splatted operand transforms for vector shifts /// and vector funnel shifts. bool isVectorShiftByScalarCheap(Type *Ty) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54859,35 +54859,6 @@ return Res; } -InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL, - const AddrMode &AM, - Type *Ty, - unsigned AS) const { - // Scaling factors are not free at all. - // An indexed folded instruction, i.e., inst (reg1, reg2, scale), - // will take 2 allocations in the out of order engine instead of 1 - // for plain addressing mode, i.e. inst (reg1). - // E.g., - // vaddps (%rsi,%rdx), %ymm0, %ymm1 - // Requires two allocations (one for the load, one for the computation) - // whereas: - // vaddps (%rsi), %ymm0, %ymm1 - // Requires just 1 allocation, i.e., freeing allocations for other operations - // and having less micro operations to execute. - // - // For some X86 architectures, this is even worse because for instance for - // stores, the complex addressing mode forces the instruction to use the - // "load" ports instead of the dedicated "store" port. - // E.g., on Haswell: - // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. - // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. - if (isLegalAddressingMode(DL, AM, Ty, AS)) - // Scale represents reg2 * scale, thus account for 1 - // as soon as we use a second register. - return AM.Scale != 0; - return -1; -} - bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. 
However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -219,6 +219,15 @@ InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); + /// Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, + int64_t Scale, unsigned AddrSpace) const; + bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp(); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -202,7 +202,7 @@ } // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -1088,7 +1088,7 @@ VectorType *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. - std::pair LT = TLI->getTypeLegalizationCost(DL, BaseTp); + std::pair LT = getTypeLegalizationCost(BaseTp); Kind = improveShuffleKindFromMask(Kind, Mask); // Treat Transpose as 2-op shuffles - there's no difference in lowering. 
@@ -1107,8 +1107,7 @@ int NumElts = LT.second.getVectorNumElements(); if ((Index % NumElts) == 0) return 0; - std::pair SubLT = - TLI->getTypeLegalizationCost(DL, SubTp); + std::pair SubLT = getTypeLegalizationCost(SubTp); if (SubLT.second.isVector()) { int NumSubElts = SubLT.second.getVectorNumElements(); if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) @@ -1154,8 +1153,7 @@ // isn't free, because we need to preserve the rest of the wide vector. if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { int NumElts = LT.second.getVectorNumElements(); - std::pair SubLT = - TLI->getTypeLegalizationCost(DL, SubTp); + std::pair SubLT = getTypeLegalizationCost(SubTp); if (SubLT.second.isVector()) { int NumSubElts = SubLT.second.getVectorNumElements(); if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) @@ -2440,9 +2438,8 @@ } // Fall back to legalized types. - std::pair LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair LTDest = - TLI->getTypeLegalizationCost(DL, Dst); + std::pair LTSrc = getTypeLegalizationCost(Src); + std::pair LTDest = getTypeLegalizationCost(Dst); if (ST->useAVX512Regs()) { if (ST->hasBWI()) @@ -2538,7 +2535,7 @@ I); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; @@ -3282,7 +3279,7 @@ if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, OpTy); + std::pair LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; // Attempt to lookup cost. @@ -3516,8 +3513,7 @@ if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair LT = - TLI->getTypeLegalizationCost(DL, RetTy); + std::pair LT = getTypeLegalizationCost(RetTy); MVT MTy = LT.second; // Attempt to lookup cost. @@ -3590,7 +3586,7 @@ if (Index != -1U && (Opcode == Instruction::ExtractElement || Opcode == Instruction::InsertElement)) { // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair LT = getTypeLegalizationCost(Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -3678,7 +3674,7 @@ // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. if (Insert) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); MVT MScalarTy = LT.second.getScalarType(); if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || @@ -3813,10 +3809,10 @@ auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); // Legalize the types. - MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; - MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second; - MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second; - MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; + MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second; + MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second; + MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second; + MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second; // They should have legalized into vector types. if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) @@ -3891,7 +3887,7 @@ CostKind); // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair LT = getTypeLegalizationCost(Src); auto *VTy = dyn_cast(Src); @@ -4054,7 +4050,7 @@ } // Legalize the type. 
- std::pair LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair LT = getTypeLegalizationCost(SrcVTy); auto VT = TLI->getValueType(DL, SrcVTy); InstructionCost Cost = 0; if (VT.isSimple() && LT.second != VT.getSimpleVT() && @@ -4170,7 +4166,7 @@ return Entry->Cost; } - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; @@ -4358,7 +4354,7 @@ InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair LT = getTypeLegalizationCost(Ty); MVT MTy = LT.second; @@ -4488,7 +4484,7 @@ X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) { - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair LT = getTypeLegalizationCost(ValTy); MVT MTy = LT.second; @@ -4915,10 +4911,8 @@ auto *IndexVTy = FixedVectorType::get( IntegerType::get(SrcVTy->getContext(), IndexSize), VF); - std::pair IdxsLT = - TLI->getTypeLegalizationCost(DL, IndexVTy); - std::pair SrcLT = - TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair IdxsLT = getTypeLegalizationCost(IndexVTy); + std::pair SrcLT = getTypeLegalizationCost(SrcVTy); InstructionCost::CostType SplitFactor = *std::max(IdxsLT.first, SrcLT.first).getValue(); if (SplitFactor > 1) { @@ -5319,7 +5313,7 @@ // Calculate the number of memory operations (NumOfMemOps), required // for load/store the VecTy. 
- MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + MVT LegalVT = getTypeLegalizationCost(VecTy).second; unsigned VecTySize = DL.getTypeStoreSize(VecTy); unsigned LegalVTSize = LegalVT.getStoreSize(); unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; @@ -5399,8 +5393,7 @@ auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), VecTy->getNumElements() / Factor); InstructionCost NumOfResults = - getTLI()->getTypeLegalizationCost(DL, ResultTy).first * - NumOfLoadsInInterleaveGrp; + getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp; // About a half of the loads may be folded in shuffles when we have only // one result. If we have more than one result, or the loads are masked, @@ -5498,7 +5491,7 @@ // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have // VecTy = <12 x i32>. - MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + MVT LegalVT = getTypeLegalizationCost(VecTy).second; // This function can be called with VecTy=<6xi128>, Factor=3, in which case // the VF=2, while v2i128 is an unsupported MVT vector type @@ -5776,3 +5769,37 @@ Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } + +InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + int64_t BaseOffset, + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const { + // Scaling factors are not free at all. + // An indexed folded instruction, i.e., inst (reg1, reg2, scale), + // will take 2 allocations in the out of order engine instead of 1 + // for plain addressing mode, i.e. inst (reg1). + // E.g., + // vaddps (%rsi,%rdx), %ymm0, %ymm1 + // Requires two allocations (one for the load, one for the computation) + // whereas: + // vaddps (%rsi), %ymm0, %ymm1 + // Requires just 1 allocation, i.e., freeing allocations for other operations + // and having less micro operations to execute. 
+ // + // For some X86 architectures, this is even worse because for instance for + // stores, the complex addressing mode forces the instruction to use the + // "load" ports instead of the dedicated "store" port. + // E.g., on Haswell: + // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. + TargetLoweringBase::AddrMode AM; + AM.BaseGV = BaseGV; + AM.BaseOffs = BaseOffset; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Scale; + if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) + // Scale represents reg2 * scale, thus account for 1 + // as soon as we use a second register. + return AM.Scale != 0; + return -1; +}