Diff 260885

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 605 Lines • ▼ Show 20 Lines	public:
/// Return true if switches should be turned into lookup tables		/// Return true if switches should be turned into lookup tables
/// containing this constant value for the target.		/// containing this constant value for the target.
bool shouldBuildLookupTablesForConstant(Constant *C) const;		bool shouldBuildLookupTablesForConstant(Constant *C) const;

/// Return true if the input function which is cold at all call sites,		/// Return true if the input function which is cold at all call sites,
/// should use coldcc calling convention.		/// should use coldcc calling convention.
bool useColdCCForColdCall(Function &F) const;		bool useColdCCForColdCall(Function &F) const;

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;		/// Estimate the overhead of scalarizing an instruction. Insert and Extract
		/// are set if the demanded result elements need to be inserted and/or
		/// extracted from vectors.
		unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
		bool Insert, bool Extract) const;

		/// Estimate the overhead of scalarizing an instructions unique
		/// non-constant operands. The types of the arguments are ordinarily
		/// scalar, in which case the costs are multiplied with VF.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) const;		unsigned VF) const;

/// If target has efficient vector element load/store instructions, it can		/// If target has efficient vector element load/store instructions, it can
/// return true here so that insertion/extraction costs are not added to		/// return true here so that insertion/extraction costs are not added to
/// the scalarization cost of a load/store.		/// the scalarization cost of a load/store.
bool supportsEfficientVectorElementLoadStore() const;		bool supportsEfficientVectorElementLoadStore() const;

▲ Show 20 Lines • Show All 602 Lines • ▼ Show 20 Lines	public:
virtual bool LSRWithInstrQueries() = 0;		virtual bool LSRWithInstrQueries() = 0;
virtual bool isTruncateFree(Type Ty1, Type Ty2) = 0;		virtual bool isTruncateFree(Type Ty1, Type Ty2) = 0;
virtual bool isProfitableToHoist(Instruction *I) = 0;		virtual bool isProfitableToHoist(Instruction *I) = 0;
virtual bool useAA() = 0;		virtual bool useAA() = 0;
virtual bool isTypeLegal(Type *Ty) = 0;		virtual bool isTypeLegal(Type *Ty) = 0;
virtual bool shouldBuildLookupTables() = 0;		virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;		virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool useColdCCForColdCall(Function &F) = 0;		virtual bool useColdCCForColdCall(Function &F) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, bool Insert,		virtual unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Extract) = 0;		bool Insert, bool Extract) = 0;
virtual unsigned		virtual unsigned
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) = 0;		unsigned VF) = 0;
virtual bool supportsEfficientVectorElementLoadStore() = 0;		virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;		virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
virtual MemCmpExpansionOptions		virtual MemCmpExpansionOptions
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;		enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
virtual bool enableInterleavedAccessVectorization() = 0;		virtual bool enableInterleavedAccessVectorization() = 0;
▲ Show 20 Lines • Show All 307 Lines • ▼ Show 20 Lines	public:
}		}
bool shouldBuildLookupTablesForConstant(Constant *C) override {		bool shouldBuildLookupTablesForConstant(Constant *C) override {
return Impl.shouldBuildLookupTablesForConstant(C);		return Impl.shouldBuildLookupTablesForConstant(C);
}		}
bool useColdCCForColdCall(Function &F) override {		bool useColdCCForColdCall(Function &F) override {
return Impl.useColdCCForColdCall(F);		return Impl.useColdCCForColdCall(F);
}		}

unsigned getScalarizationOverhead(Type *Ty, bool Insert,		unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Extract) override {		bool Insert, bool Extract) override {
return Impl.getScalarizationOverhead(Ty, Insert, Extract);		return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}		}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) override {		unsigned VF) override {
return Impl.getOperandsScalarizationOverhead(Args, VF);		return Impl.getOperandsScalarizationOverhead(Args, VF);
}		}

bool supportsEfficientVectorElementLoadStore() override {		bool supportsEfficientVectorElementLoadStore() override {
return Impl.supportsEfficientVectorElementLoadStore();		return Impl.supportsEfficientVectorElementLoadStore();
▲ Show 20 Lines • Show All 403 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 233 Lines • ▼ Show 20 Lines	public:

bool isTypeLegal(Type *Ty) { return false; }		bool isTypeLegal(Type *Ty) { return false; }

bool shouldBuildLookupTables() { return true; }		bool shouldBuildLookupTables() { return true; }
bool shouldBuildLookupTablesForConstant(Constant *C) { return true; }		bool shouldBuildLookupTablesForConstant(Constant *C) { return true; }

bool useColdCCForColdCall(Function &F) { return false; }		bool useColdCCForColdCall(Function &F) { return false; }

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {		unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
		bool Insert, bool Extract) {
return 0;		return 0;
}		}

unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) {		unsigned VF) {
return 0;		return 0;
}		}

▲ Show 20 Lines • Show All 654 Lines • Show Last 20 Lines

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 542 Lines • ▼ Show 20 Lines	public:
/// @}		/// @}

/// \name Vector TTI Implementations		/// \name Vector TTI Implementations
/// @{		/// @{

unsigned getRegisterBitWidth(bool Vector) const { return 32; }		unsigned getRegisterBitWidth(bool Vector) const { return 32; }

/// Estimate the overhead of scalarizing an instruction. Insert and Extract		/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the result needs to be inserted and/or extracted from vectors.		/// are set if the demanded result elements need to be inserted and/or
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {		/// extracted from vectors.
		unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
		bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);		auto *VTy = cast<VectorType>(Ty);
		assert(DemandedElts.getBitWidth() == VTy->getNumElements() &&
		"Vector size mismatch");

unsigned Cost = 0;		unsigned Cost = 0;

for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {		for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
		if (!DemandedElts[i])
		continue;
if (Insert)		if (Insert)
Cost += static_cast<T *>(this)->getVectorInstrCost(		Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::InsertElement, VTy, i);		Instruction::InsertElement, VTy, i);
if (Extract)		if (Extract)
Cost += static_cast<T *>(this)->getVectorInstrCost(		Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::ExtractElement, VTy, i);		Instruction::ExtractElement, VTy, i);
}		}

return Cost;		return Cost;
}		}

		/// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
		unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
		auto *VTy = cast<VectorType>(Ty);
		APInt DemandedElts = APInt::getAllOnesValue(VTy->getNumElements());
		return static_cast<T *>(this)->getScalarizationOverhead(Ty, DemandedElts,
		Insert, Extract);
		}

/// Estimate the overhead of scalarizing an instructions unique		/// Estimate the overhead of scalarizing an instructions unique
/// non-constant operands. The types of the arguments are ordinarily		/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied with VF.		/// scalar, in which case the costs are multiplied with VF.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,		unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) {		unsigned VF) {
unsigned Cost = 0;		unsigned Cost = 0;
SmallPtrSet<const Value*, 4> UniqueOperands;		SmallPtrSet<const Value*, 4> UniqueOperands;
for (const Value *A : Args) {		for (const Value *A : Args) {
▲ Show 20 Lines • Show All 1,183 Lines • Show Last 20 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 362 Lines • ▼ Show 20 Lines	bool TargetTransformInfo::shouldBuildLookupTablesForConstant(
Constant *C) const {		Constant *C) const {
return TTIImpl->shouldBuildLookupTablesForConstant(C);		return TTIImpl->shouldBuildLookupTablesForConstant(C);
}		}

bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {		bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
return TTIImpl->useColdCCForColdCall(F);		return TTIImpl->useColdCCForColdCall(F);
}		}

unsigned TargetTransformInfo::getScalarizationOverhead(Type *Ty, bool Insert,		unsigned TargetTransformInfo::getScalarizationOverhead(
bool Extract) const {		Type *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);		return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}		}

unsigned TargetTransformInfo::getOperandsScalarizationOverhead(		unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
ArrayRef<const Value *> Args, unsigned VF) const {		ArrayRef<const Value *> Args, unsigned VF) const {
return TTIImpl->getOperandsScalarizationOverhead(Args, VF);		return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
}		}

bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {		bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
▲ Show 20 Lines • Show All 1,013 Lines • Show Last 20 Lines

llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h

Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines	public:
}		}
bool prefersVectorizedAddressing() {		bool prefersVectorizedAddressing() {
return false;		return false;
}		}
bool enableInterleavedAccessVectorization() {		bool enableInterleavedAccessVectorization() {
return true;		return true;
}		}

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);		unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
		bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,		unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF);		unsigned VF);
unsigned getCallInstrCost(Function F, Type RetTy, ArrayRef<Type*> Tys);		unsigned getCallInstrCost(Function F, Type RetTy, ArrayRef<Type*> Tys);
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF,		ArrayRef<Value *> Args, FastMathFlags FMF,
unsigned VF, const Instruction *I);		unsigned VF, const Instruction *I);
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,		unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF,		ArrayRef<Type *> Tys, FastMathFlags FMF,
unsigned ScalarizationCostPassed = UINT_MAX,		unsigned ScalarizationCostPassed = UINT_MAX,
const Instruction *I = nullptr);		const Instruction *I = nullptr);
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp

	Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
	unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {			unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
	return useHVX() ? ST.getVectorLength()*8 : 0;			return useHVX() ? ST.getVectorLength()*8 : 0;
	}			}

	unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {			unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
	return (8 * ST.getVectorLength()) / ElemWidth;			return (8 * ST.getVectorLength()) / ElemWidth;
	}			}

	unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,			unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty,
	bool Extract) {			const APInt &DemandedElts,
	return BaseT::getScalarizationOverhead(Ty, Insert, Extract);			bool Insert, bool Extract) {
				return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
	}			}

	unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(			unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
	ArrayRef<const Value*> Args, unsigned VF) {			ArrayRef<const Value*> Args, unsigned VF) {
	return BaseT::getOperandsScalarizationOverhead(Args, VF);			return BaseT::getOperandsScalarizationOverhead(Args, VF);
	}			}

	unsigned HexagonTTIImpl::getCallInstrCost(Function F, Type RetTy,			unsigned HexagonTTIImpl::getCallInstrCost(Function F, Type RetTy,
	▲ Show 20 Lines • Show All 203 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86TargetTransformInfo.h

Show First 20 Lines • Show All 126 Lines • ▼ Show 20 Lines	int getArithmeticInstrCost(
const Instruction *CxtI = nullptr);		const Instruction *CxtI = nullptr);
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,		int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);		VectorType *SubTp);
int getCastInstrCost(unsigned Opcode, Type Dst, Type Src,		int getCastInstrCost(unsigned Opcode, Type Dst, Type Src,
const Instruction *I = nullptr);		const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy,		int getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy,
const Instruction *I = nullptr);		const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);		int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);		unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
		bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,		int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);		unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,		int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);		unsigned AddressSpace);
int getGatherScatterOpCost(unsigned Opcode, Type DataTy, Value Ptr,		int getGatherScatterOpCost(unsigned Opcode, Type DataTy, Value Ptr,
bool VariableMask, unsigned Alignment,		bool VariableMask, unsigned Alignment,
const Instruction *I);		const Instruction *I);
int getAddressComputationCost(Type PtrTy, ScalarEvolution SE,		int getAddressComputationCost(Type PtrTy, ScalarEvolution SE,
▲ Show 20 Lines • Show All 79 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 2,862 Lines • ▼ Show 20 Lines	int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
// Add to the base cost if we know that the extracted element of a vector is		// Add to the base cost if we know that the extracted element of a vector is
// destined to be moved to and used in the integer register file.		// destined to be moved to and used in the integer register file.
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())		if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
RegisterFileMoveCost += 1;		RegisterFileMoveCost += 1;

return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;		return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}		}

unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,		unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty,
bool Extract) {		const APInt &DemandedElts,
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);		bool Insert, bool Extract) {
		auto* VecTy = cast<VectorType>(Ty);
		unsigned Cost = 0;

		// For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
		// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
		if (Insert) {
		std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
		MVT MScalarTy = LT.second.getScalarType();

		if ((MScalarTy == MVT::i16 && ST->hasSSE2()) \|\|
		(MScalarTy.isInteger() && ST->hasSSE41()) \|\|
		(MScalarTy == MVT::f32 && ST->hasSSE41())) {
		// For types we can insert directly, insertion into 128-bit sub vectors is
		// cheap, followed by a cheap chain of concatenations.
		if (LT.second.getSizeInBits() <= 128) {
		craig.topperUnsubmitted Not Done Reply Inline Actions Doesn't this over count the cost of inserting multiple elements into the upper half of a 256 bit vector? We'll cost a subvector insert for each element, but we should be able to share. craig.topper: Doesn't this over count the cost of inserting multiple elements into the upper half of a 256…
		Cost +=
		BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
		} else {
		unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
		Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
		Cost += DemandedElts.countPopulation();
		craig.topperUnsubmitted Not Done Reply Inline Actions What about the 0 cost for fp insertion into element 0 of each subvector? craig.topper: What about the 0 cost for fp insertion into element 0 of each subvector?
		RKSimonAuthorUnsubmitted Done Reply Inline Actions Nice catch - I'll update the patch tomorrow RKSimon: Nice catch - I'll update the patch tomorrow

		// For vXf32 cases, insertion into the 0'th index in each v4f32
		// 128-bit vector is free.
		// NOTE: This assumes legalization widens vXf32 vectors.
		if (MScalarTy == MVT::f32)
		for (unsigned i = 0, e = VecTy->getNumElements(); i < e; i += 4)
		if (DemandedElts[i])
		Cost--;
		}
		} else if (LT.second.isVector()) {
		// Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
		// integer element as a SCALAR_TO_VECTOR, then we build the vector as a
		// series of UNPCK followed by CONCAT_VECTORS - all of these can be
		// considered cheap.
		if (Ty->isIntOrIntVectorTy())
		Cost += DemandedElts.countPopulation();

		// Get the smaller of the legalized or original pow2-extended number of
		// vector elements, which represents the number of unpacks we'll end up
		// performing.
		unsigned NumElts = LT.second.getVectorNumElements();
		unsigned Pow2Elts = PowerOf2Ceil(VecTy->getNumElements());
		Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
		}
		}

		// TODO: Use default extraction for now, but we should investigate extending this
		// to handle repeated subvector extraction.
		craig.topperUnsubmitted Not Done Reply Inline Actions Should this be a marked a proper FIXME/TODO? craig.topper: Should this be a marked a proper FIXME/TODO?
		if (Extract)
		Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);

		return Cost;
}		}

int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,		int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,		MaybeAlign Alignment, unsigned AddressSpace,
const Instruction *I) {		const Instruction *I) {
// Handle non-power-of-two vectors such as <3 x float>		// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {		if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getNumElements();		unsigned NumElem = VTy->getNumElements();

// Handle a few common cases:		// Handle a few common cases:
// <3 x float>		// <3 x float>
if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)		if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
// Cost = 64 bit store + extract + 32 bit store.		// Cost = 64 bit store + extract + 32 bit store.
return 3;		return 3;

// <3 x double>		// <3 x double>
if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)		if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
// Cost = 128 bit store + unpack + 64 bit store.		// Cost = 128 bit store + unpack + 64 bit store.
return 3;		return 3;

// Assume that all other non-power-of-two numbers are scalarized.		// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {		if (!isPowerOf2_32(NumElem)) {
		APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,		int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
AddressSpace);		AddressSpace);
int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,		int SplitCost = getScalarizationOverhead(Src, DemandedElts,
		Opcode == Instruction::Load,
Opcode == Instruction::Store);		Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;		return NumElem * Cost + SplitCost;
}		}
}		}

// Legalize the type.		// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);		std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load \|\| Opcode == Instruction::Store) &&		assert((Opcode == Instruction::Load \|\| Opcode == Instruction::Store) &&
Show All 23 Lines	int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,

unsigned NumElem = SrcVTy->getNumElements();		unsigned NumElem = SrcVTy->getNumElements();
VectorType *MaskTy =		VectorType *MaskTy =
VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);		VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) \|\|		if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) \|\|
(IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) \|\|		(IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) \|\|
!isPowerOf2_32(NumElem)) {		!isPowerOf2_32(NumElem)) {
// Scalarization		// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);		APInt DemandedElts = APInt::getAllOnesValue(NumElem);
		int MaskSplitCost =
		getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(		int ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);		Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);		int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);		int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
		int ValueSplitCost =
int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);		getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =		int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),		NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
MaybeAlign(Alignment), AddressSpace);		MaybeAlign(Alignment), AddressSpace);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;		return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}		}

// Legalize the type.		// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);		std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
▲ Show 20 Lines • Show All 837 Lines • ▼ Show 20 Lines
/// VariableMask - The mask is non-constant at compile time.		/// VariableMask - The mask is non-constant at compile time.
/// Alignment - Alignment for one element.		/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.		/// AddressSpace - pointer[s] address space.
///		///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,		int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, unsigned Alignment,		bool VariableMask, unsigned Alignment,
unsigned AddressSpace) {		unsigned AddressSpace) {
unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();		unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
		APInt DemandedElts = APInt::getAllOnesValue(VF);

int MaskUnpackCost = 0;		int MaskUnpackCost = 0;
if (VariableMask) {		if (VariableMask) {
VectorType *MaskTy =		VectorType *MaskTy =
VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);		VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);		MaskUnpackCost =
		getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =		int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),		getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
nullptr);		nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);		int BranchCost = getCFInstrCost(Instruction::Br);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);		MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}		}

// The cost of the scalar loads/stores.		// The cost of the scalar loads/stores.
▲ Show 20 Lines • Show All 572 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,697 Lines • ▼ Show 20 Lines	while (!Worklist.empty()) {
// the instruction as if it wasn't if-converted and instead remained in the		// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after		// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.		// computing the scalarization overhead.
unsigned ScalarCost = VF * getInstructionCost(I, 1).first;		unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

// Compute the scalarization overhead of needed insertelement instructions		// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.		// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {		if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),		ScalarCost +=
true, false);		TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
		APInt::getAllOnesValue(VF), true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);		ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}		}

// Compute the scalarization overhead of needed extractelement		// Compute the scalarization overhead of needed extractelement
// instructions. For each of the instruction's operands, if the operand can		// instructions. For each of the instruction's operands, if the operand can
// be scalarized, add it to the worklist; otherwise, account for the		// be scalarized, add it to the worklist; otherwise, account for the
// overhead.		// overhead.
for (Use &U : I->operands())		for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get())) {		if (auto *J = dyn_cast<Instruction>(U.get())) {
assert(VectorType::isValidElementType(J->getType()) &&		assert(VectorType::isValidElementType(J->getType()) &&
"Instruction has non-scalar type");		"Instruction has non-scalar type");
if (canBeScalarized(J))		if (canBeScalarized(J))
Worklist.push_back(J);		Worklist.push_back(J);
else if (needsExtract(J, VF))		else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(		ScalarCost += TTI.getScalarizationOverhead(
ToVectorTy(J->getType(),VF), false, true);		ToVectorTy(J->getType(), VF), APInt::getAllOnesValue(VF), false,
		true);
}		}

// Scale the total scalar cost by block probability.		// Scale the total scalar cost by block probability.
ScalarCost /= getReciprocalPredBlockProb();		ScalarCost /= getReciprocalPredBlockProb();

// Compute the discount. A non-negative discount means the vector version		// Compute the discount. A non-negative discount means the vector version
// of the instruction costs more, and scalarizing would be beneficial.		// of the instruction costs more, and scalarizing would be beneficial.
Discount += VectorCost - ScalarCost;		Discount += VectorCost - ScalarCost;
▲ Show 20 Lines • Show All 267 Lines • ▼ Show 20 Lines	unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,

if (VF == 1)		if (VF == 1)
return 0;		return 0;

unsigned Cost = 0;		unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);		Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&		if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) \|\| !TTI.supportsEfficientVectorElementLoadStore()))		(!isa<LoadInst>(I) \|\| !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(RetTy, true, false);		Cost += TTI.getScalarizationOverhead(RetTy, APInt::getAllOnesValue(VF),
		true, false);

// Some targets keep addresses scalar.		// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())		if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
return Cost;		return Cost;

// Some targets support efficient element stores.		// Some targets support efficient element stores.
if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())		if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
return Cost;		return Cost;
▲ Show 20 Lines • Show All 189 Lines • ▼ Show 20 Lines	if (VF > 1 && BI->isConditional() &&
PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=		PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
PredicatedBBsAfterVectorization.end()))		PredicatedBBsAfterVectorization.end()))
ScalarPredicatedBB = true;		ScalarPredicatedBB = true;

if (ScalarPredicatedBB) {		if (ScalarPredicatedBB) {
// Return cost for branches around scalarized and predicated blocks.		// Return cost for branches around scalarized and predicated blocks.
Type *Vec_i1Ty =		Type *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);		VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +		return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
		false, true) +
(TTI.getCFInstrCost(Instruction::Br) * VF));		(TTI.getCFInstrCost(Instruction::Br) * VF));
} else if (I->getParent() == TheLoop->getLoopLatch() \|\| VF == 1)		} else if (I->getParent() == TheLoop->getLoopLatch() \|\| VF == 1)
// The back-edge branch will remain, as will all scalar branches.		// The back-edge branch will remain, as will all scalar branches.
return TTI.getCFInstrCost(Instruction::Br);		return TTI.getCFInstrCost(Instruction::Br);
else		else
// This branch will be eliminated by if-conversion.		// This branch will be eliminated by if-conversion.
return 0;		return 0;
// Note: We currently assume zero cost for an unconditional branch inside		// Note: We currently assume zero cost for an unconditional branch inside
▲ Show 20 Lines • Show All 1,844 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,871 Lines • ▼ Show 20 Lines	int BoUpSLP::getTreeCost() {
if (ViewSLPTree)		if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);		ViewGraph(this, "SLP" + F->getName(), false, Str);

return Cost;		return Cost;
}		}

int BoUpSLP::getGatherCost(VectorType *Ty,		int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {		const DenseSet<unsigned> &ShuffledIndices) const {
int Cost = 0;		unsigned NumElts = Ty->getNumElements();
for (unsigned i = 0, e = Ty->getNumElements(); i < e; ++i)		APInt DemandedElts = APInt::getNullValue(NumElts);
		for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))		if (!ShuffledIndices.count(i))
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);		DemandedElts.setBit(i);
		int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /Insert/ true,
		craig.topperUnsubmitted Not Done Reply Inline Actions Can we annotate the bools with // comments in all the call sites to make it readable? craig.topper: Can we annotate the bools with // comments in all the call sites to make it readable?
		/Extract/ false);
if (!ShuffledIndices.empty())		if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);		Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;		return Cost;
}		}

int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {		int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the type of the operands in VL.		// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();		Type *ScalarTy = VL[0]->getType();
▲ Show 20 Lines • Show All 3,121 Lines • ▼ Show 20 Lines
static bool findBuildAggregate(Value LastInsertInst, TargetTransformInfo TTI,		static bool findBuildAggregate(Value LastInsertInst, TargetTransformInfo TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,		SmallVectorImpl<Value *> &BuildVectorOpds,
int &UserCost) {		int &UserCost) {
assert((isa<InsertElementInst>(LastInsertInst) \|\|		assert((isa<InsertElementInst>(LastInsertInst) \|\|
isa<InsertValueInst>(LastInsertInst)) &&		isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");		"Expected insertelement or insertvalue instruction!");
UserCost = 0;		UserCost = 0;
do {		do {
		// TODO: Use TTI's getScalarizationOverhead for sequence of inserts rather
		vdmitrieUnsubmitted Not Done Reply Inline Actions This sounds like a simple machinery task while it is not. I was trying to explain the underlying issue in D76956. The problem is that we have a bias in cost model toward favoring vectorization for longer vectors and if we fix just this place the bias will be much more severe. Since this patch effectively replaces 76956 I'm perfectly fine in a case if you decide to borrow/adopt my comment. vdmitrie: This sounds like a simple machinery task while it is not. I was trying to explain the…
		// than sum of single inserts as the latter may overestimate cost.
		// This work should imply improving cost estimation for extracts that
		// added in for external (for vectorization tree) users.
		// For example, in following case all extracts added in order to feed
		// into external users (inserts), which in turn form sequence to build
		// an aggregate that we do match here:
		// %4 = extractelement <4 x i64> %3, i32 0
		// %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
		// %5 = extractelement <4 x i64> %3, i32 1
		// %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
		// %6 = extractelement <4 x i64> %3, i32 2
		// %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
		// %7 = extractelement <4 x i64> %3, i32 3
		// %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
		//
		// Cost of this entire sequence is currently estimated as sum of single
		// extracts (as this aggregate build sequence is an external to
		// vectorization tree user) minus cost of the aggregate build.
		// As this whole sequence will be optimized away we want the cost to be
		// zero. But it is not quite possible using given approach (at least for
		// X86) because inserts can be more expensive than extracts for longer
		// vector lengths so the difference turns out not zero in such a case.
		// Ideally we want to match this entire sequence and treat it as a no-op
		// (i.e. do not count into final cost at all).
		// Currently the difference tends to be negative thus adding a bias
		// toward favoring vectorization. If we switch into using TTI interface
		// the bias tendency will remain but will be lower.
Value *InsertedOperand;		Value *InsertedOperand;
if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {		if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
InsertedOperand = IE->getOperand(1);		InsertedOperand = IE->getOperand(1);
LastInsertInst = IE->getOperand(0);		LastInsertInst = IE->getOperand(0);
if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {		if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,		UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
IE->getType(), CI->getZExtValue());		IE->getType(), CI->getZExtValue());
}		}
▲ Show 20 Lines • Show All 505 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/arith-fp.ll

Show First 20 Lines • Show All 667 Lines • ▼ Show 20 Lines	;
%V8F64 = fdiv <8 x double> undef, undef		%V8F64 = fdiv <8 x double> undef, undef

ret i32 undef		ret i32 undef
}		}

define i32 @frem(i32 %arg) {		define i32 @frem(i32 %arg) {
; SSE1-LABEL: 'frem'		; SSE1-LABEL: 'frem'
; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = frem <8 x double> undef, undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = frem <8 x double> undef, undef
; SSE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE2-LABEL: 'frem'		; SSE2-LABEL: 'frem'
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'frem'		; SSE42-LABEL: 'frem'
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX-LABEL: 'frem'		; AVX-LABEL: 'frem'
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F32 = frem <8 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16F32 = frem <16 x float> undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16F32 = frem <16 x float> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef		; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512-LABEL: 'frem'		; AVX512-LABEL: 'frem'
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F32 = frem <8 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16F32 = frem <16 x float> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = frem <8 x double> undef, undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = frem <8 x double> undef, undef
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SLM-LABEL: 'frem'		; SLM-LABEL: 'frem'
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef		; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef		; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef		; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef		; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef		; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
▲ Show 20 Lines • Show All 319 Lines • ▼ Show 20 Lines	;
%V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)		%V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)

ret i32 undef		ret i32 undef
}		}

define i32 @fma(i32 %arg) {		define i32 @fma(i32 %arg) {
; SSE1-LABEL: 'fma'		; SSE1-LABEL: 'fma'
; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)		; SSE1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; SSE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE2-LABEL: 'fma'		; SSE2-LABEL: 'fma'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fma'		; SSE42-LABEL: 'fma'
; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX-LABEL: 'fma'		; AVX-LABEL: 'fma'
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512-LABEL: 'fma'		; AVX512-LABEL: 'fma'
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
▲ Show 20 Lines • Show All 79 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/fptosi.ll

; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE2		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
;		;
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm \| FileCheck %s --check-prefixes=CHECK,SSE,SLM		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm \| FileCheck %s --check-prefixes=CHECK,SSE,SLM
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 \| FileCheck %s --check-prefixes=CHECK,AVX,BTVER2		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 \| FileCheck %s --check-prefixes=CHECK,AVX,BTVER2

define i32 @fptosi_double_i64(i32 %arg) {		define i32 @fptosi_double_i64(i32 %arg) {
; SSE2-LABEL: 'fptosi_double_i64'		; SSE2-LABEL: 'fptosi_double_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fptosi_double_i64'		; SSE42-LABEL: 'fptosi_double_i64'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX-LABEL: 'fptosi_double_i64'		; AVX-LABEL: 'fptosi_double_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64		; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512F-LABEL: 'fptosi_double_i64'		; AVX512F-LABEL: 'fptosi_double_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64		; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512DQ-LABEL: 'fptosi_double_i64'		; AVX512DQ-LABEL: 'fptosi_double_i64'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	;
%V4I8 = fptosi <4 x double> undef to <4 x i8>		%V4I8 = fptosi <4 x double> undef to <4 x i8>
%V8I8 = fptosi <8 x double> undef to <8 x i8>		%V8I8 = fptosi <8 x double> undef to <8 x i8>
ret i32 undef		ret i32 undef
}		}

define i32 @fptosi_float_i64(i32 %arg) {		define i32 @fptosi_float_i64(i32 %arg) {
; SSE2-LABEL: 'fptosi_float_i64'		; SSE2-LABEL: 'fptosi_float_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fptosi_float_i64'		; SSE42-LABEL: 'fptosi_float_i64'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX-LABEL: 'fptosi_float_i64'		; AVX-LABEL: 'fptosi_float_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64		; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512F-LABEL: 'fptosi_float_i64'		; AVX512F-LABEL: 'fptosi_float_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64		; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512DQ-LABEL: 'fptosi_float_i64'		; AVX512DQ-LABEL: 'fptosi_float_i64'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
▲ Show 20 Lines • Show All 132 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/fptoui.ll

; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE2		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq \| FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
;		;
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm \| FileCheck %s --check-prefixes=CHECK,SSE,SLM		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm \| FileCheck %s --check-prefixes=CHECK,SSE,SLM
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42
; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 \| FileCheck %s --check-prefixes=CHECK,AVX,BTVER2		; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 \| FileCheck %s --check-prefixes=CHECK,AVX,BTVER2

define i32 @fptoui_double_i64(i32 %arg) {		define i32 @fptoui_double_i64(i32 %arg) {
; SSE2-LABEL: 'fptoui_double_i64'		; SSE2-LABEL: 'fptoui_double_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64		; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fptoui_double_i64'		; SSE42-LABEL: 'fptoui_double_i64'
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64		; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX-LABEL: 'fptoui_double_i64'		; AVX-LABEL: 'fptoui_double_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64		; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512F-LABEL: 'fptoui_double_i64'		; AVX512F-LABEL: 'fptoui_double_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64		; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512DQ-LABEL: 'fptoui_double_i64'		; AVX512DQ-LABEL: 'fptoui_double_i64'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
Show All 10 Lines	;
%V4I64 = fptoui <4 x double> undef to <4 x i64>		%V4I64 = fptoui <4 x double> undef to <4 x i64>
%V8I64 = fptoui <8 x double> undef to <8 x i64>		%V8I64 = fptoui <8 x double> undef to <8 x i64>
ret i32 undef		ret i32 undef
}		}

define i32 @fptoui_double_i32(i32 %arg) {		define i32 @fptoui_double_i32(i32 %arg) {
; SSE2-LABEL: 'fptoui_double_i32'		; SSE2-LABEL: 'fptoui_double_i32'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fptoui_double_i32'		; SSE42-LABEL: 'fptoui_double_i32'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
%V4I8 = fptoui <4 x double> undef to <4 x i8>		%V4I8 = fptoui <4 x double> undef to <4 x i8>
%V8I8 = fptoui <8 x double> undef to <8 x i8>		%V8I8 = fptoui <8 x double> undef to <8 x i8>
ret i32 undef		ret i32 undef
}		}

define i32 @fptoui_float_i64(i32 %arg) {		define i32 @fptoui_float_i64(i32 %arg) {
; SSE2-LABEL: 'fptoui_float_i64'		; SSE2-LABEL: 'fptoui_float_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64		; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>		; SSE2-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fptoui_float_i64'		; SSE42-LABEL: 'fptoui_float_i64'
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64		; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>		; SSE42-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX-LABEL: 'fptoui_float_i64'		; AVX-LABEL: 'fptoui_float_i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64		; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>		; AVX-NEXT: Cost Model: Found an estimated cost of 111 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512F-LABEL: 'fptoui_float_i64'		; AVX512F-LABEL: 'fptoui_float_i64'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64		; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>		; AVX512F-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; AVX512DQ-LABEL: 'fptoui_float_i64'		; AVX512DQ-LABEL: 'fptoui_float_i64'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>		; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
Show All 13 Lines	;
%V8I64 = fptoui <8 x float> undef to <8 x i64>		%V8I64 = fptoui <8 x float> undef to <8 x i64>
%V16I64 = fptoui <16 x float> undef to <16 x i64>		%V16I64 = fptoui <16 x float> undef to <16 x i64>
ret i32 undef		ret i32 undef
}		}

define i32 @fptoui_float_i32(i32 %arg) {		define i32 @fptoui_float_i32(i32 %arg) {
; SSE2-LABEL: 'fptoui_float_i32'		; SSE2-LABEL: 'fptoui_float_i32'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>		; SSE2-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'fptoui_float_i32'		; SSE42-LABEL: 'fptoui_float_i32'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>		; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
▲ Show 20 Lines • Show All 131 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/fround.ll

Show All 10 Lines
; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 \| FileCheck %s --check-prefixes=CHECK,AVX,BTVER2		; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 \| FileCheck %s --check-prefixes=CHECK,AVX,BTVER2

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"		target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"		target triple = "x86_64-apple-macosx10.8.0"

define i32 @ceil(i32 %arg) {		define i32 @ceil(i32 %arg) {
; SSE2-LABEL: 'ceil'		; SSE2-LABEL: 'ceil'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.ceil.f32(float undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.ceil.f32(float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.ceil.f64(double undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.ceil.f64(double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.ceil.v2f64(<2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.ceil.v2f64(<2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.ceil.v4f64(<4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.ceil.v4f64(<4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.ceil.v8f64(<8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.ceil.v8f64(<8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'ceil'		; SSE42-LABEL: 'ceil'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.ceil.f32(float undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.ceil.f32(float undef)
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	;
%V8F64 = call <8 x double> @llvm.ceil.v8f64(<8 x double> undef)		%V8F64 = call <8 x double> @llvm.ceil.v8f64(<8 x double> undef)

ret i32 undef		ret i32 undef
}		}

define i32 @floor(i32 %arg) {		define i32 @floor(i32 %arg) {
; SSE2-LABEL: 'floor'		; SSE2-LABEL: 'floor'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.floor.f32(float undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.floor.f32(float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.floor.f64(double undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.floor.f64(double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.floor.v2f64(<2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.floor.v2f64(<2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.floor.v4f64(<4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.floor.v4f64(<4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.floor.v8f64(<8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.floor.v8f64(<8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'floor'		; SSE42-LABEL: 'floor'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.floor.f32(float undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.floor.f32(float undef)
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	;
%V8F64 = call <8 x double> @llvm.floor.v8f64(<8 x double> undef)		%V8F64 = call <8 x double> @llvm.floor.v8f64(<8 x double> undef)

ret i32 undef		ret i32 undef
}		}

define i32 @nearbyint(i32 %arg) {		define i32 @nearbyint(i32 %arg) {
; SSE2-LABEL: 'nearbyint'		; SSE2-LABEL: 'nearbyint'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.nearbyint.f64(double undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.nearbyint.f64(double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'nearbyint'		; SSE42-LABEL: 'nearbyint'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef)
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	;
%V8F64 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> undef)		%V8F64 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> undef)

ret i32 undef		ret i32 undef
}		}

define i32 @rint(i32 %arg) {		define i32 @rint(i32 %arg) {
; SSE2-LABEL: 'rint'		; SSE2-LABEL: 'rint'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'rint'		; SSE42-LABEL: 'rint'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	;
%V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)		%V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)

ret i32 undef		ret i32 undef
}		}

define i32 @trunc(i32 %arg) {		define i32 @trunc(i32 %arg) {
; SSE2-LABEL: 'trunc'		; SSE2-LABEL: 'trunc'
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.trunc.f32(float undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.trunc.f32(float undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.trunc.f64(double undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.trunc.f64(double undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.trunc.v2f64(<2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.trunc.v2f64(<2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.trunc.v4f64(<4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.trunc.v4f64(<4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.trunc.v8f64(<8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.trunc.v8f64(<8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;		;
; SSE42-LABEL: 'trunc'		; SSE42-LABEL: 'trunc'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.trunc.f32(float undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.trunc.f32(float undef)
▲ Show 20 Lines • Show All 115 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll

Show All 16 Lines	vector.body: ; preds = %vector.body, %vector.ph
%index.next = add i64 %index, 4		%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024		%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body		br i1 %3, label %for.end, label %vector.body

for.end: ; preds = %vector.body		for.end: ; preds = %vector.body
ret void		ret void

; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1':		; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1':
; CORE2: Cost Model: Found an estimated cost of 49 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)		; CORE2: Cost Model: Found an estimated cost of 46 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)

; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1':		; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1':
; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)		; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)

}		}

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone		declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone

Show All 11 Lines	vector.body: ; preds = %vector.body, %vector.ph
%index.next = add i64 %index, 4		%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024		%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body		br i1 %3, label %for.end, label %vector.body

for.end: ; preds = %vector.body		for.end: ; preds = %vector.body
ret void		ret void

; CORE2: Printing analysis 'Cost Model Analysis' for function 'test2':		; CORE2: Printing analysis 'Cost Model Analysis' for function 'test2':
; CORE2: Cost Model: Found an estimated cost of 49 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)		; CORE2: Cost Model: Found an estimated cost of 46 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)

; COREI7: Printing analysis 'Cost Model Analysis' for function 'test2':		; COREI7: Printing analysis 'Cost Model Analysis' for function 'test2':
; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)		; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)

}		}

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone		declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone

Show All 27 Lines

llvm/test/Analysis/CostModel/X86/load_store.ll

	Show First 20 Lines • Show All 103 Lines • ▼ Show 20 Lines
	; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4			; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4
	; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX-LABEL: 'loads'			; AVX-LABEL: 'loads'
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, i8* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, i8* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, i16* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, i16* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, i32* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, i32* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, i64* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, i64* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load i128, i128* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load i128, i128* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i32>, <2 x i32>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i32>, <2 x i32>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <4 x i32>, <4 x i32>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <4 x i32>, <4 x i32>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4			; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4
	; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512-LABEL: 'loads'			; AVX512-LABEL: 'loads'
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, i8* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, i8* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, i16* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, i16* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, i32* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, i32* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, i64* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, i64* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load i128, i128* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load i128, i128* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i32>, <2 x i32>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i32>, <2 x i32>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <4 x i32>, <4 x i32>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <4 x i32>, <4 x i32>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i32>, <8 x i32>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i64>, <2 x i64>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <4 x i64>, <4 x i64>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <8 x i64>, <8 x i64>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = load <3 x float>, <3 x float>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = load <3 x double>, <3 x double>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = load <3 x i32>, <3 x i32>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = load <3 x i64>, <3 x i64>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %16 = load <5 x i32>, <5 x i32>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4			; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %17 = load <5 x i64>, <5 x i64>* undef, align 4
	; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	load i8, i8* undef, align 4			load i8, i8* undef, align 4
	load i16, i16* undef, align 4			load i16, i16* undef, align 4
	load i32, i32* undef, align 4			load i32, i32* undef, align 4
	load i64, i64* undef, align 4			load i64, i64* undef, align 4
	load i128, i128* undef, align 4			load i128, i128* undef, align 4

	Show All 18 Lines

llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,SSE,SSE2		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,SSE,SSE42
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2
;		;
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX,SKL		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX,SKL
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX512,KNL		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX512,KNL
; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX512,SKX		; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze \| FileCheck %s --check-prefixes=CHECK,AVX512,SKX

define i32 @masked_load() {		define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'		; SSE2-LABEL: 'masked_load'
; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 111 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1148 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 574 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 287 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
; SSE42-LABEL: 'masked_load'		; SSE42-LABEL: 'masked_load'
; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
Show All 30 Lines
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 290 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
; KNL-LABEL: 'masked_load'		; KNL-LABEL: 'masked_load'
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 307 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)		; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
; SKX-LABEL: 'masked_load'		; SKX-LABEL: 'masked_load'
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)		; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)		; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)		; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
▲ Show 20 Lines • Show All 615 Lines • ▼ Show 20 Lines
}		}

define i32 @masked_expandload() {		define i32 @masked_expandload() {
; SSE2-LABEL: 'masked_expandload'		; SSE2-LABEL: 'masked_expandload'
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)		; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
; SSE42-LABEL: 'masked_expandload'		; SSE42-LABEL: 'masked_expandload'
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
Show All 14 Lines
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)		; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
; AVX-LABEL: 'masked_expandload'		; AVX-LABEL: 'masked_expandload'
; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)		; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
; AVX512-LABEL: 'masked_expandload'		; AVX512-LABEL: 'masked_expandload'
; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)		; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0		; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;		;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)		%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
%V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)		%V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
%V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)		%V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
%V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)		%V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
▲ Show 20 Lines • Show All 192 Lines • ▼ Show 20 Lines	;
%mask = icmp eq <2 x i64> %trigger, zeroinitializer		%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)		%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
ret <2 x double> %res		ret <2 x double> %res
}		}

define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {		define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; SSE2-LABEL: 'test2'		; SSE2-LABEL: 'test2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)		; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
;		;
; SSE42-LABEL: 'test2'		; SSE42-LABEL: 'test2'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)		; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
;		;
; AVX-LABEL: 'test2'		; AVX-LABEL: 'test2'
Show All 35 Lines	;
%mask = icmp eq <4 x i32> %trigger, zeroinitializer		%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)		call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
ret void		ret void
}		}

define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {		define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
; SSE2-LABEL: 'test4'		; SSE2-LABEL: 'test4'
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer		; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)		; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
;		;
; SSE42-LABEL: 'test4'		; SSE42-LABEL: 'test4'
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer		; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)		; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
;		;
; AVX1-LABEL: 'test4'		; AVX1-LABEL: 'test4'
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	;
%mask = icmp eq <2 x i32> %trigger, zeroinitializer		%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)		call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
ret void		ret void
}		}

define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {		define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; SSE2-LABEL: 'test7'		; SSE2-LABEL: 'test7'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)		; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
;		;
; SSE42-LABEL: 'test7'		; SSE42-LABEL: 'test7'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)		; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
;		;
; AVX-LABEL: 'test7'		; AVX-LABEL: 'test7'
Show All 9 Lines	;
%mask = icmp eq <2 x i32> %trigger, zeroinitializer		%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)		%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
ret <2 x float> %res		ret <2 x float> %res
}		}

define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {		define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; SSE2-LABEL: 'test8'		; SSE2-LABEL: 'test8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer		; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)		; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res		; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;		;
; SSE42-LABEL: 'test8'		; SSE42-LABEL: 'test8'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer		; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)		; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res		; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;		;
; AVX-LABEL: 'test8'		; AVX-LABEL: 'test8'
▲ Show 20 Lines • Show All 683 Lines • Show Last 20 Lines

llvm/test/Analysis/CostModel/X86/sitofp.ll

	Show First 20 Lines • Show All 110 Lines • ▼ Show 20 Lines
	; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>			; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
	; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>			; AVX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
	; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512F-LABEL: 'sitofp_i64_double'			; AVX512F-LABEL: 'sitofp_i64_double'
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512DQ-LABEL: 'sitofp_i64_double'			; AVX512DQ-LABEL: 'sitofp_i64_double'
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines
	; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>			; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
	; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>			; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
	; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512F-LABEL: 'sitofp_i64_float'			; AVX512F-LABEL: 'sitofp_i64_float'
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>			; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512DQ-LABEL: 'sitofp_i64_float'			; AVX512DQ-LABEL: 'sitofp_i64_float'
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
	Show All 9 Lines

llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll

	Show All 12 Lines
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[IDXPROM:%.]] = sext i32 [[I:%.]] to i64			; CHECK-NEXT: [[IDXPROM:%.]] = sext i32 [[I:%.]] to i64
	; CHECK-NEXT: [[IDXPROM5:%.]] = sext i32 [[J:%.]] to i64			; CHECK-NEXT: [[IDXPROM5:%.]] = sext i32 [[J:%.]] to i64
	; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]			; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
	; CHECK: vector.ph:			; CHECK: vector.ph:
	; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]			; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
	; CHECK: vector.body:			; CHECK: vector.body:
	; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]			; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
	; CHECK-NEXT: [[VEC_PHI:%.]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.]], [[VECTOR_BODY]] ]			; CHECK-NEXT: [[VEC_PHI:%.]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.]], [[VECTOR_BODY]] ]
	; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0			; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
	; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer			; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
	; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>			; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
	; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0			; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
	; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1			; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
	; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2			; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
	; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3			; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
	; CHECK-NEXT: [[TMP4:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]			; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
	; CHECK-NEXT: [[TMP5:%.]] = getelementptr inbounds i32, i32 [[TMP4]], i32 0			; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
	; CHECK-NEXT: [[TMP6:%.]] = bitcast i32 [[TMP5]] to <4 x i32>*			; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
	; CHECK-NEXT: [[WIDE_LOAD:%.]] = load <4 x i32>, <4 x i32> [[TMP6]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
	; CHECK-NEXT: [[TMP7:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]			; CHECK-NEXT: [[TMP8:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
	; CHECK-NEXT: [[TMP8:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]			; CHECK-NEXT: [[TMP9:%.]] = getelementptr inbounds i32, i32 [[TMP8]], i32 0
	; CHECK-NEXT: [[TMP9:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]			; CHECK-NEXT: [[TMP10:%.]] = bitcast i32 [[TMP9]] to <8 x i32>*
	; CHECK-NEXT: [[TMP10:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]			; CHECK-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[TMP10]], align 4, !tbaa !1
	; CHECK-NEXT: [[TMP11:%.]] = load i32, i32 [[TMP7]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP11:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP12:%.]] = load i32, i32 [[TMP8]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP12:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP13:%.]] = load i32, i32 [[TMP9]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP13:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP14:%.]] = load i32, i32 [[TMP10]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP14:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0			; CHECK-NEXT: [[TMP15:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 1			; CHECK-NEXT: [[TMP16:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP13]], i32 2			; CHECK-NEXT: [[TMP17:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i32 3			; CHECK-NEXT: [[TMP18:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP18]], [[WIDE_LOAD]]			; CHECK-NEXT: [[TMP19:%.]] = load i32, i32 [[TMP11]], align 4, !tbaa !1
	; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4>			; CHECK-NEXT: [[TMP20:%.]] = load i32, i32 [[TMP12]], align 4, !tbaa !1
	; CHECK-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[TMP19]]			; CHECK-NEXT: [[TMP21:%.]] = load i32, i32 [[TMP13]], align 4, !tbaa !1
	; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4			; CHECK-NEXT: [[TMP22:%.]] = load i32, i32 [[TMP14]], align 4, !tbaa !1
	; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100			; CHECK-NEXT: [[TMP23:%.]] = load i32, i32 [[TMP15]], align 4, !tbaa !1
	; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5			; CHECK-NEXT: [[TMP24:%.]] = load i32, i32 [[TMP16]], align 4, !tbaa !1
				; CHECK-NEXT: [[TMP25:%.]] = load i32, i32 [[TMP17]], align 4, !tbaa !1
				; CHECK-NEXT: [[TMP26:%.]] = load i32, i32 [[TMP18]], align 4, !tbaa !1
				; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
				; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1
				; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2
				; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3
				; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4
				; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5
				; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6
				; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7
				; CHECK-NEXT: [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]]
				; CHECK-NEXT: [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
				; CHECK-NEXT: [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]]
				; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
				; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
				; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
	; CHECK: middle.block:			; CHECK: middle.block:
	; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP21]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>			; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP37]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
	; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[RDX_SHUF]]			; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP37]], [[RDX_SHUF]]
	; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>			; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]			; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
	; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0			; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100			; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
				; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96
	; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]			; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
	; CHECK: scalar.ph:			; CHECK: scalar.ph:
	; CHECK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]			; CHECK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
	; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]			; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
	; CHECK-NEXT: br label [[FOR_BODY:%.*]]			; CHECK-NEXT: br label [[FOR_BODY:%.*]]
	; CHECK: for.cond.cleanup:			; CHECK: for.cond.cleanup:
	; CHECK-NEXT: [[ADD7_LCSSA:%.]] = phi i32 [ [[ADD7:%.]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]			; CHECK-NEXT: [[ADD7_LCSSA:%.]] = phi i32 [ [[ADD7:%.]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
	; CHECK-NEXT: ret i32 [[ADD7_LCSSA]]			; CHECK-NEXT: ret i32 [[ADD7_LCSSA]]
	; CHECK: for.body:			; CHECK: for.body:
	; CHECK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]			; CHECK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
	; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]			; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
	; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]			; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
	; CHECK-NEXT: [[TMP24:%.]] = load i32, i32 [[ARRAYIDX2]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP40:%.]] = load i32, i32 [[ARRAYIDX2]], align 4, !tbaa !1
	; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]			; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds [100 x i32], [100 x i32] [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
	; CHECK-NEXT: [[TMP25:%.]] = load i32, i32 [[ARRAYIDX6]], align 4, !tbaa !1			; CHECK-NEXT: [[TMP41:%.]] = load i32, i32 [[ARRAYIDX6]], align 4, !tbaa !1
	; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP25]], [[TMP24]]			; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]]
	; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4			; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4
	; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]]			; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]]
	; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1			; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
	; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100			; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
	; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !7			; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !7
	;			;
	entry:			entry:
	%idxprom = sext i32 %i to i64			%idxprom = sext i32 %i to i64
	Show All 32 Lines

llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S \| FileCheck %s --check-prefixes=CHECK,SSE			; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S \| FileCheck %s --check-prefixes=CHECK,SSE
	; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1			; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S \| FileCheck %s --check-prefixes=CHECK,AVX,AVX1
	; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2			; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S \| FileCheck %s --check-prefixes=CHECK,AVX,AVX2

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; These tests ensure that we do not regress due to PR31243. Note that we set			; These tests ensure that we do not regress due to PR31243. Note that we set
	; the SLP threshold to force vectorization even when not profitable.			; the SLP threshold to force vectorization even when not profitable.

	; When computing minimum sizes, if we can prove the sign bit is zero, we can			; When computing minimum sizes, if we can prove the sign bit is zero, we can
	; zero-extend the roots back to their original sizes.			; zero-extend the roots back to their original sizes.
	;			;
	define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {			define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
	; SSE-LABEL: @PR31243_zext(			; CHECK-LABEL: @PR31243_zext(
	; SSE-NEXT: entry:			; CHECK-NEXT: entry:
	; SSE-NEXT: [[TMP0:%.]] = or i8 [[V0:%.]], 1			; CHECK-NEXT: [[TMP0:%.]] = insertelement <2 x i8> undef, i8 [[V0:%.]], i32 0
	; SSE-NEXT: [[TMP1:%.]] = or i8 [[V1:%.]], 1			; CHECK-NEXT: [[TMP1:%.]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.]], i32 1
	; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64			; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
	; SSE-NEXT: [[TMP4:%.]] = getelementptr inbounds i8, i8 [[PTR:%.*]], i64 [[TMP2]]			; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
	; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64			; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
	; SSE-NEXT: [[TMP5:%.]] = getelementptr inbounds i8, i8 [[PTR]], i64 [[TMP3]]			; CHECK-NEXT: [[TMPE4:%.]] = getelementptr inbounds i8, i8 [[PTR:%.*]], i64 [[TMP4]]
	; SSE-NEXT: [[TMP6:%.]] = load i8, i8 [[TMP4]], align 1			; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
	; SSE-NEXT: [[TMP7:%.]] = load i8, i8 [[TMP5]], align 1			; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
	; SSE-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]			; CHECK-NEXT: [[TMP5:%.]] = getelementptr inbounds i8, i8 [[PTR]], i64 [[TMP6]]
	; SSE-NEXT: ret i8 [[TMP8]]			; CHECK-NEXT: [[TMP6:%.]] = load i8, i8 [[TMPE4]], align 1
	;			; CHECK-NEXT: [[TMP7:%.]] = load i8, i8 [[TMP5]], align 1
	; AVX-LABEL: @PR31243_zext(			; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
	; AVX-NEXT: entry:			; CHECK-NEXT: ret i8 [[TMP8]]
	; AVX-NEXT: [[TMP0:%.]] = insertelement <2 x i8> undef, i8 [[V0:%.]], i32 0
	; AVX-NEXT: [[TMP1:%.]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.]], i32 1
	; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
	; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
	; AVX-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
	; AVX-NEXT: [[TMPE4:%.]] = getelementptr inbounds i8, i8 [[PTR:%.*]], i64 [[TMP4]]
	; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
	; AVX-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
	; AVX-NEXT: [[TMP5:%.]] = getelementptr inbounds i8, i8 [[PTR]], i64 [[TMP6]]
	; AVX-NEXT: [[TMP6:%.]] = load i8, i8 [[TMPE4]], align 1
	; AVX-NEXT: [[TMP7:%.]] = load i8, i8 [[TMP5]], align 1
	; AVX-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
	; AVX-NEXT: ret i8 [[TMP8]]
	;			;
	entry:			entry:
	%tmp0 = zext i8 %v0 to i32			%tmp0 = zext i8 %v0 to i32
	%tmp1 = zext i8 %v1 to i32			%tmp1 = zext i8 %v1 to i32
	%tmp2 = or i32 %tmp0, 1			%tmp2 = or i32 %tmp0, 1
	%tmp3 = or i32 %tmp1, 1			%tmp3 = or i32 %tmp1, 1
	%tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2			%tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2
	%tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3			%tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3
	▲ Show 20 Lines • Show All 62 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/resched.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s \| FileCheck %s			; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s \| FileCheck %s

	%"struct.std::array" = type { [32 x i8] }			%"struct.std::array" = type { [32 x i8] }

	; Function Attrs: nounwind uwtable			; Function Attrs: nounwind uwtable
	define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 {			define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 {
	; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(			; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.]], label [[IF_THEN22_I:%.]]			; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.]], label [[IF_THEN22_I:%.]]
	; CHECK: if.then22.i:			; CHECK: if.then22.i:
	; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1			; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
	; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]			; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
	; CHECK-NEXT: [[TMP0:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 0			; CHECK-NEXT: [[TMP0:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 0
	; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8
	; CHECK-NEXT: [[CONV_I_I1199:%.*]] = and i8 [[TMP1]], 1
	; CHECK-NEXT: store i8 [[CONV_I_I1199]], i8* [[TMP0]], align 1
	; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
	; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8
	; CHECK-NEXT: [[CONV_1_I_I:%.*]] = and i8 [[TMP2]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 1			; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 1
	; CHECK-NEXT: store i8 [[CONV_1_I_I]], i8* [[ARRAYIDX_I_I7_1_I_I]], align 1
	; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2
	; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8
	; CHECK-NEXT: [[CONV_2_I_I:%.*]] = and i8 [[TMP3]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 2			; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 2
	; CHECK-NEXT: store i8 [[CONV_2_I_I]], i8* [[ARRAYIDX_I_I7_2_I_I]], align 1
	; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3
	; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8
	; CHECK-NEXT: [[CONV_3_I_I:%.*]] = and i8 [[TMP4]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 3			; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 3
	; CHECK-NEXT: store i8 [[CONV_3_I_I]], i8* [[ARRAYIDX_I_I7_3_I_I]], align 1
	; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4
	; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8
	; CHECK-NEXT: [[CONV_4_I_I:%.*]] = and i8 [[TMP5]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 4			; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 4
	; CHECK-NEXT: store i8 [[CONV_4_I_I]], i8* [[ARRAYIDX_I_I7_4_I_I]], align 1
	; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
	; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8
	; CHECK-NEXT: [[CONV_5_I_I:%.*]] = and i8 [[TMP6]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 5			; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 5
	; CHECK-NEXT: store i8 [[CONV_5_I_I]], i8* [[ARRAYIDX_I_I7_5_I_I]], align 1
	; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
	; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8
	; CHECK-NEXT: [[CONV_6_I_I:%.*]] = and i8 [[TMP7]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 6			; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 6
	; CHECK-NEXT: store i8 [[CONV_6_I_I]], i8* [[ARRAYIDX_I_I7_6_I_I]], align 1
	; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
	; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8
	; CHECK-NEXT: [[CONV_7_I_I:%.*]] = and i8 [[TMP8]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 7			; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 7
	; CHECK-NEXT: store i8 [[CONV_7_I_I]], i8* [[ARRAYIDX_I_I7_7_I_I]], align 1			; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
	; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
	; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
	; CHECK-NEXT: [[CONV_8_I_I:%.*]] = and i8 [[TMP9]], 1			; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
				; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
				; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
				; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
				; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
				; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
	; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 8			; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 8
	; CHECK-NEXT: store i8 [[CONV_8_I_I]], i8* [[ARRAYIDX_I_I7_8_I_I]], align 1
	; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9
	; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8
	; CHECK-NEXT: [[CONV_9_I_I:%.*]] = and i8 [[TMP10]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 9			; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 9
	; CHECK-NEXT: store i8 [[CONV_9_I_I]], i8* [[ARRAYIDX_I_I7_9_I_I]], align 1
	; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10
	; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8
	; CHECK-NEXT: [[CONV_10_I_I:%.*]] = and i8 [[TMP11]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 10			; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 10
	; CHECK-NEXT: store i8 [[CONV_10_I_I]], i8* [[ARRAYIDX_I_I7_10_I_I]], align 1
	; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11
	; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8
	; CHECK-NEXT: [[CONV_11_I_I:%.*]] = and i8 [[TMP12]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 11			; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 11
	; CHECK-NEXT: store i8 [[CONV_11_I_I]], i8* [[ARRAYIDX_I_I7_11_I_I]], align 1			; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
	; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12			; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
	; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8			; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
	; CHECK-NEXT: [[CONV_12_I_I:%.*]] = and i8 [[TMP13]], 1			; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
				; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
	; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 12			; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 12
	; CHECK-NEXT: store i8 [[CONV_12_I_I]], i8* [[ARRAYIDX_I_I7_12_I_I]], align 1
	; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13			; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
	; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8
	; CHECK-NEXT: [[CONV_13_I_I:%.*]] = and i8 [[TMP14]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 13			; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 13
	; CHECK-NEXT: store i8 [[CONV_13_I_I]], i8* [[ARRAYIDX_I_I7_13_I_I]], align 1
	; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14			; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
	; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8
	; CHECK-NEXT: [[CONV_14_I_I:%.*]] = and i8 [[TMP15]], 1
	; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 14			; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 14
	; CHECK-NEXT: store i8 [[CONV_14_I_I]], i8* [[ARRAYIDX_I_I7_14_I_I]], align 1
	; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15			; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
	; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8			; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
	; CHECK-NEXT: [[CONV_15_I_I:%.*]] = and i8 [[TMP16]], 1			; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
				; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
				; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
				; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
				; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
				; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
				; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
				; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
				; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
				; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
				; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
				; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
				; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
				; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
				; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
				; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
				; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
				; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
				; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
				; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
				; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
				; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
				; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
				; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
				; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
				; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
				; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
				; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
				; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
	; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 15			; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.]] = getelementptr inbounds %"struct.std::array", %"struct.std::array" undef, i64 0, i32 0, i64 15
	; CHECK-NEXT: store i8 [[CONV_15_I_I]], i8* [[ARRAYIDX_I_I7_15_I_I]], align 1			; CHECK-NEXT: [[TMP45:%.]] = bitcast i8 [[TMP0]] to <16 x i8>*
				; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
	; CHECK-NEXT: unreachable			; CHECK-NEXT: unreachable
	; CHECK: if.end50.i:			; CHECK: if.end50.i:
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	br i1 undef, label %if.end50.i, label %if.then22.i			br i1 undef, label %if.end50.i, label %if.then22.i

	if.then22.i: ; preds = %entry			if.then22.i: ; preds = %entry
	▲ Show 20 Lines • Show All 86 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s \| FileCheck %s		; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s \| FileCheck %s

define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {		define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
; CHECK-LABEL: @foo(		; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 1		; CHECK-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.]] = load i32, i32 [[ARRAYIDX]], align 4		; CHECK-NEXT: [[TMP0:%.]] = bitcast i32 [[ARR]] to <2 x i32>*
; CHECK-NEXT: [[ADD:%.]] = add i32 [[TMP0]], [[A1:%.]]		; CHECK-NEXT: [[TMP1:%.]] = load <2 x i32>, <2 x i32> [[TMP0]], align 4
; CHECK-NEXT: [[ADD2:%.]] = add i32 [[TMP0]], [[A2:%.]]		; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[ADD4:%.]] = add i32 [[TMP0]], [[A3:%.]]		; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: [[ADD6:%.]] = add i32 [[TMP0]], [[A4:%.]]		; CHECK-NEXT: [[TMP2:%.]] = insertelement <8 x i32> undef, i32 [[A1:%.]], i32 0
; CHECK-NEXT: [[ADD8:%.]] = add i32 [[TMP0]], [[A5:%.]]		; CHECK-NEXT: [[TMP3:%.]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.]], i32 1
; CHECK-NEXT: [[ADD10:%.]] = add i32 [[TMP0]], [[A6:%.]]		; CHECK-NEXT: [[TMP4:%.]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.]], i32 2
; CHECK-NEXT: [[TMP1:%.]] = load i32, i32 [[ARR]], align 4		; CHECK-NEXT: [[TMP5:%.]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.]], i32 3
; CHECK-NEXT: [[ADD12:%.]] = add i32 [[TMP1]], [[A7:%.]]		; CHECK-NEXT: [[TMP6:%.]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.]], i32 4
; CHECK-NEXT: [[ADD14:%.]] = add i32 [[TMP1]], [[A8:%.]]		; CHECK-NEXT: [[TMP7:%.]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.]], i32 5
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]		; CHECK-NEXT: [[TMP8:%.]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.]], i32 6
; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]		; CHECK-NEXT: [[TMP9:%.]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.]], i32 7
; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]		; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]		; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]		; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]		; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]		; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]		; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]		; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]		; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]		; CHECK-NEXT: ret i32 [[TMP11]]
; CHECK-NEXT: ret i32 [[COND44]]
;		;
entry:		entry:
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 1		%arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
%0 = load i32, i32* %arrayidx, align 4		%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %a1		%add = add i32 %0, %a1
%add2 = add i32 %0, %a2		%add2 = add i32 %0, %a2
%add4 = add i32 %0, %a3		%add4 = add i32 %0, %a3
%add6 = add i32 %0, %a4		%add6 = add i32 %0, %a4
Show All 18 Lines	entry:
%cond44 = select i1 %cmp40, i32 %cond39, i32 %add14		%cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
ret i32 %cond44		ret i32 %cond44
}		}

define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {		define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
; CHECK-LABEL: @foo1(		; CHECK-LABEL: @foo1(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 1		; CHECK-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.]] = load i32, i32 [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ADD:%.]] = add i32 [[TMP0]], [[A1:%.]]
; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 2		; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 2
; CHECK-NEXT: [[TMP1:%.]] = load i32, i32 [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ADD2:%.]] = add i32 [[TMP1]], [[A2:%.]]
; CHECK-NEXT: [[ARRAYIDX3:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 3		; CHECK-NEXT: [[ARRAYIDX3:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 3
; CHECK-NEXT: [[TMP2:%.]] = load i32, i32 [[ARRAYIDX3]], align 4		; CHECK-NEXT: [[TMP0:%.]] = bitcast i32 [[ARR]] to <4 x i32>*
; CHECK-NEXT: [[ADD4:%.]] = add i32 [[TMP2]], [[A3:%.]]		; CHECK-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> [[TMP0]], align 4
; CHECK-NEXT: [[ADD6:%.]] = add i32 [[TMP0]], [[A4:%.]]		; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
; CHECK-NEXT: [[ADD8:%.]] = add i32 [[TMP0]], [[A5:%.]]		; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.]] = load i32, i32 [[ARR]], align 4		; CHECK-NEXT: [[TMP2:%.]] = insertelement <8 x i32> undef, i32 [[A1:%.]], i32 0
; CHECK-NEXT: [[ADD10:%.]] = add i32 [[TMP3]], [[A6:%.]]		; CHECK-NEXT: [[TMP3:%.]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.]], i32 1
; CHECK-NEXT: [[ADD12:%.]] = add i32 [[TMP1]], [[A7:%.]]		; CHECK-NEXT: [[TMP4:%.]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.]], i32 2
; CHECK-NEXT: [[ADD14:%.]] = add i32 [[TMP0]], [[A8:%.]]		; CHECK-NEXT: [[TMP5:%.]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.]], i32 3
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]		; CHECK-NEXT: [[TMP6:%.]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.]], i32 4
; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]		; CHECK-NEXT: [[TMP7:%.]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.]], i32 5
; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]		; CHECK-NEXT: [[TMP8:%.]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.]], i32 6
; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]		; CHECK-NEXT: [[TMP9:%.]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.]], i32 7
; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]		; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]		; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]		; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]		; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]		; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]		; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]		; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
; CHECK-NEXT: ret i32 [[COND44]]		; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
		; CHECK-NEXT: ret i32 [[TMP11]]
;		;
entry:		entry:
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 1		%arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
%0 = load i32, i32* %arrayidx, align 4		%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %a1		%add = add i32 %0, %a1
%arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2		%arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
%1 = load i32, i32* %arrayidx1, align 4		%1 = load i32, i32* %arrayidx1, align 4
%add2 = add i32 %1, %a2		%add2 = add i32 %1, %a2
Show All 22 Lines	entry:
%cond44 = select i1 %cmp40, i32 %cond39, i32 %add14		%cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
ret i32 %cond44		ret i32 %cond44
}		}

define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {		define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
; CHECK-LABEL: @foo2(		; CHECK-LABEL: @foo2(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 3		; CHECK-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 3
; CHECK-NEXT: [[TMP0:%.]] = load i32, i32 [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ADD:%.]] = add i32 [[TMP0]], [[A1:%.]]
; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 2		; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 2
; CHECK-NEXT: [[TMP1:%.]] = load i32, i32 [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ADD2:%.]] = add i32 [[TMP1]], [[A2:%.]]
; CHECK-NEXT: [[ADD4:%.]] = add i32 [[TMP0]], [[A3:%.]]
; CHECK-NEXT: [[TMP2:%.]] = load i32, i32 [[ARR]], align 4
; CHECK-NEXT: [[ADD6:%.]] = add i32 [[TMP2]], [[A4:%.]]
; CHECK-NEXT: [[ARRAYIDX7:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 1		; CHECK-NEXT: [[ARRAYIDX7:%.]] = getelementptr inbounds i32, i32 [[ARR]], i64 1
; CHECK-NEXT: [[TMP3:%.]] = load i32, i32 [[ARRAYIDX7]], align 4		; CHECK-NEXT: [[TMP0:%.]] = bitcast i32 [[ARR]] to <4 x i32>*
; CHECK-NEXT: [[ADD8:%.]] = add i32 [[TMP3]], [[A5:%.]]		; CHECK-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> [[TMP0]], align 4
; CHECK-NEXT: [[ADD10:%.]] = add i32 [[TMP2]], [[A6:%.]]		; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
; CHECK-NEXT: [[ADD12:%.]] = add i32 [[TMP1]], [[A7:%.]]		; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 2, i32 3, i32 2, i32 1, i32 3>
; CHECK-NEXT: [[ADD14:%.]] = add i32 [[TMP3]], [[A8:%.]]		; CHECK-NEXT: [[TMP2:%.]] = insertelement <8 x i32> undef, i32 [[A1:%.]], i32 0
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]		; CHECK-NEXT: [[TMP3:%.]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.]], i32 1
; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]		; CHECK-NEXT: [[TMP4:%.]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.]], i32 2
; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]		; CHECK-NEXT: [[TMP5:%.]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.]], i32 3
; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]		; CHECK-NEXT: [[TMP6:%.]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.]], i32 4
; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]		; CHECK-NEXT: [[TMP7:%.]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.]], i32 5
; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]		; CHECK-NEXT: [[TMP8:%.]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.]], i32 6
; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]		; CHECK-NEXT: [[TMP9:%.]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.]], i32 7
; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]		; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]		; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]		; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]		; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]		; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]		; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
; CHECK-NEXT: ret i32 [[COND44]]		; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
		; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
		; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
		; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
		; CHECK-NEXT: ret i32 [[TMP11]]
;		;
entry:		entry:
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 3		%arrayidx = getelementptr inbounds i32, i32* %arr, i64 3
%0 = load i32, i32* %arrayidx, align 4		%0 = load i32, i32* %arrayidx, align 4
%add = add i32 %0, %a1		%add = add i32 %0, %a1
%arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2		%arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
%1 = load i32, i32* %arrayidx1, align 4		%1 = load i32, i32* %arrayidx1, align 4
%add2 = add i32 %1, %a2		%add2 = add i32 %1, %a2
Show All 25 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[TTI] Add DemandedElts to getScalarizationOverhead
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 260885

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h

llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp

llvm/lib/Target/X86/X86TargetTransformInfo.h

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Analysis/CostModel/X86/arith-fp.ll

llvm/test/Analysis/CostModel/X86/fptosi.ll

llvm/test/Analysis/CostModel/X86/fptoui.ll

llvm/test/Analysis/CostModel/X86/fround.ll

llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll

llvm/test/Analysis/CostModel/X86/load_store.ll

llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

llvm/test/Analysis/CostModel/X86/sitofp.ll

llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll

llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll

llvm/test/Transforms/SLPVectorizer/X86/resched.ll

llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll

This is an archive of the discontinued LLVM Phabricator instance.

[TTI] Add DemandedElts to getScalarizationOverheadClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 260885

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h

llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp

llvm/lib/Target/X86/X86TargetTransformInfo.h

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Analysis/CostModel/X86/arith-fp.ll

llvm/test/Analysis/CostModel/X86/fptosi.ll

llvm/test/Analysis/CostModel/X86/fptoui.ll

llvm/test/Analysis/CostModel/X86/fround.ll

llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll

llvm/test/Analysis/CostModel/X86/load_store.ll

llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

llvm/test/Analysis/CostModel/X86/sitofp.ll

llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll

llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll

llvm/test/Transforms/SLPVectorizer/X86/resched.ll

llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll

[TTI] Add DemandedElts to getScalarizationOverhead
ClosedPublic