Diff 556471

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 700 Lines • ▼ Show 20 Lines	public:
/// mode is legal for a load/store of any legal type.		/// mode is legal for a load/store of any legal type.
/// If target returns true in LSRWithInstrQueries(), I may be valid.		/// If target returns true in LSRWithInstrQueries(), I may be valid.
/// TODO: Handle pre/postinc as well.		/// TODO: Handle pre/postinc as well.
bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV, int64_t BaseOffset,		bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale,		bool HasBaseReg, int64_t Scale,
unsigned AddrSpace = 0,		unsigned AddrSpace = 0,
Instruction *I = nullptr) const;		Instruction *I = nullptr) const;

		/// Checks if the specified operation with the given vector type is not going
		/// to be scalarized.
		///
		/// The first (unnamed) parameter identifies the operation; the BasicTTIImpl
		/// implementation added in this change feeds it to InstructionOpcodeToISD,
		/// i.e. it is treated as an IR instruction opcode. The second parameter is
		/// the vector type being queried.
		/// \returns true if the operation is expected to remain a vector operation.
		bool isLegalVectorOp(unsigned, VectorType *) const;

		/// Checks if the specified operation(intrinsic) with the given vector type is
		/// not going to be scalarized.
		///
		/// The first (unnamed) parameter is the intrinsic ID and the second is the
		/// vector type being queried.
		/// \returns true if the intrinsic is expected to remain a vector operation.
		bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const;

/// Return true if LSR cost of C1 is lower than C2.		/// Return true if LSR cost of C1 is lower than C2.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,		bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) const;		const TargetTransformInfo::LSRCost &C2) const;

/// Return true if LSR major cost is number of registers. Targets which		/// Return true if LSR major cost is number of registers. Targets which
/// implement their own isLSRCostLess and unset number of registers as major		/// implement their own isLSRCostLess and unset number of registers as major
/// cost should return false, otherwise return true.		/// cost should return false, otherwise return true.
bool isNumRegsMajorCostOfLSR() const;		bool isNumRegsMajorCostOfLSR() const;
▲ Show 20 Lines • Show All 1,035 Lines • ▼ Show 20 Lines	virtual std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
std::function<void(Instruction *, unsigned, APInt, APInt &)>		std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) = 0;		SimplifyAndSetOp) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;		virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;		virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
virtual bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV,		virtual bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV,
int64_t BaseOffset, bool HasBaseReg,		int64_t BaseOffset, bool HasBaseReg,
int64_t Scale, unsigned AddrSpace,		int64_t Scale, unsigned AddrSpace,
Instruction *I) = 0;		Instruction *I) = 0;
		// Pure-virtual hook for the isLegalVectorOp query (operation code, vector
		// type); the override shown in this change forwards the call to Impl.
		virtual bool isLegalVectorOp(unsigned, VectorType *) const = 0;

		// Pure-virtual hook for the isLegalVectorIntrinsic query (intrinsic ID,
		// vector type); the override shown in this change forwards the call to Impl.
		virtual bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const = 0;

virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,		virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) = 0;		const TargetTransformInfo::LSRCost &C2) = 0;
virtual bool isNumRegsMajorCostOfLSR() = 0;		virtual bool isNumRegsMajorCostOfLSR() = 0;
virtual bool isProfitableLSRChainElement(Instruction *I) = 0;		virtual bool isProfitableLSRChainElement(Instruction *I) = 0;
virtual bool canMacroFuseCmp() = 0;		virtual bool canMacroFuseCmp() = 0;
virtual bool canSaveCmp(Loop L, BranchInst BI, ScalarEvolution SE,		virtual bool canSaveCmp(Loop L, BranchInst BI, ScalarEvolution SE,
LoopInfo LI, DominatorTree DT, AssumptionCache *AC,		LoopInfo LI, DominatorTree DT, AssumptionCache *AC,
TargetLibraryInfo *LibInfo) = 0;		TargetLibraryInfo *LibInfo) = 0;
▲ Show 20 Lines • Show All 425 Lines • ▼ Show 20 Lines	bool isLegalICmpImmediate(int64_t Imm) override {
return Impl.isLegalICmpImmediate(Imm);		return Impl.isLegalICmpImmediate(Imm);
}		}
bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV, int64_t BaseOffset,		bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale, unsigned AddrSpace,		bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
Instruction *I) override {		Instruction *I) override {
return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,		return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
AddrSpace, I);		AddrSpace, I);
}		}
		// Forwards the (Opcode, VecTy) legality query unchanged to Impl and returns
		// its answer.
		bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const override {
		return Impl.isLegalVectorOp(Opcode, VecTy);
		}

		// Forwards the (Id, VecTy) legality query unchanged to Impl and returns its
		// answer.
		bool isLegalVectorIntrinsic(Intrinsic::ID Id,
		VectorType *VecTy) const override {
		return Impl.isLegalVectorIntrinsic(Id, VecTy);
		}

bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,		bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) override {		const TargetTransformInfo::LSRCost &C2) override {
return Impl.isLSRCostLess(C1, C2);		return Impl.isLSRCostLess(C1, C2);
}		}
bool isNumRegsMajorCostOfLSR() override {		bool isNumRegsMajorCostOfLSR() override {
return Impl.isNumRegsMajorCostOfLSR();		return Impl.isNumRegsMajorCostOfLSR();
}		}
bool isProfitableLSRChainElement(Instruction *I) override {		bool isProfitableLSRChainElement(Instruction *I) override {
▲ Show 20 Lines • Show All 641 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 292 Lines • ▼ Show 20 Lines	public:

bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,		bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
const SmallBitVector &OpcodeMask) const {		const SmallBitVector &OpcodeMask) const {
return false;		return false;
}		}

bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }		bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }

		// Default answer: every (operation, vector type) pair is reported as legal.
		// Both arguments are ignored. BasicTTIImpl.h in this change supplies a
		// TargetLowering-based implementation of the same query.
		bool isLegalVectorOp(unsigned, VectorType *) const { return true; }

		// Default answer: every (intrinsic, vector type) pair is reported as legal.
		// Both arguments are ignored. BasicTTIImpl.h in this change supplies a
		// TargetLowering-based implementation of the same query.
		bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const {
		return true;
		}

bool enableOrderedReductions() const { return false; }		bool enableOrderedReductions() const { return false; }

bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }		bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }

bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {		bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {
return false;		return false;
}		}

▲ Show 20 Lines • Show All 1,078 Lines • Show Last 20 Lines

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 336 Lines • ▼ Show 20 Lines	bool isLegalAddressingMode(Type Ty, GlobalValue BaseGV, int64_t BaseOffset,
TargetLoweringBase::AddrMode AM;		TargetLoweringBase::AddrMode AM;
AM.BaseGV = BaseGV;		AM.BaseGV = BaseGV;
AM.BaseOffs = BaseOffset;		AM.BaseOffs = BaseOffset;
AM.HasBaseReg = HasBaseReg;		AM.HasBaseReg = HasBaseReg;
AM.Scale = Scale;		AM.Scale = Scale;
return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);		return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
}		}

		bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const {
		int ISD = getTLI()->InstructionOpcodeToISD(Opcode);
		EVT VT = getTLI()->getValueType(DL, VecTy);
		TargetLoweringBase::LegalizeKind LK =
		getTLI()->getTypeConversion(VecTy->getContext(), VT);
		return LK.first != TargetLoweringBase::TypeScalarizeVector &&
		getTLI()->getOperationAction(ISD, LK.second) !=
		TargetLowering::Expand;
		}

		/// Maps an intrinsic ID to the ISD opcode listed for it in the switch below.
		///
		/// \param IID  The intrinsic to translate.
		/// \returns the matching ISD opcode, or ISD::DELETED_NODE when \p IID has no
		///          entry in the table (callers in this change compare against that
		///          value to detect "no mapping").
		static unsigned intrinsicIdToISD(Intrinsic::ID IID) {
		RKSimonUnsubmitted Not Done Reply Inline Actions getTypeBasedIntrinsicInstrCost already has a similar IntrinsicID->ISD conversion - merge them? RKSimon: getTypeBasedIntrinsicInstrCost already has a similar IntrinsicID->ISD conversion - merge them?
		switch (IID) {
		default:
		// No table entry: fall out of the switch to the DELETED_NODE return below.
		break;
		case Intrinsic::sqrt:
		return ISD::FSQRT;
		case Intrinsic::sin:
		return ISD::FSIN;
		case Intrinsic::cos:
		return ISD::FCOS;
		case Intrinsic::exp:
		return ISD::FEXP;
		case Intrinsic::exp2:
		return ISD::FEXP2;
		case Intrinsic::exp10:
		return ISD::FEXP10;
		case Intrinsic::log:
		return ISD::FLOG;
		case Intrinsic::log10:
		return ISD::FLOG10;
		case Intrinsic::log2:
		return ISD::FLOG2;
		case Intrinsic::fabs:
		return ISD::FABS;
		case Intrinsic::canonicalize:
		RKSimonUnsubmitted Not Done Reply Inline Actions ISD::FCOS? RKSimon: ISD::FCOS?
		return ISD::FCANONICALIZE;
		case Intrinsic::minnum:
		return ISD::FMINNUM;
		case Intrinsic::maxnum:
		return ISD::FMAXNUM;
		case Intrinsic::minimum:
		return ISD::FMINIMUM;
		case Intrinsic::maximum:
		return ISD::FMAXIMUM;
		case Intrinsic::copysign:
		return ISD::FCOPYSIGN;
		case Intrinsic::floor:
		return ISD::FFLOOR;
		case Intrinsic::ceil:
		return ISD::FCEIL;
		case Intrinsic::trunc:
		return ISD::FTRUNC;
		case Intrinsic::nearbyint:
		return ISD::FNEARBYINT;
		case Intrinsic::rint:
		return ISD::FRINT;
		case Intrinsic::round:
		return ISD::FROUND;
		case Intrinsic::roundeven:
		return ISD::FROUNDEVEN;
		case Intrinsic::pow:
		return ISD::FPOW;
		case Intrinsic::fma:
		return ISD::FMA;
		// fmuladd shares the FMA opcode with fma.
		case Intrinsic::fmuladd:
		return ISD::FMA;
		case Intrinsic::experimental_constrained_fmuladd:
		return ISD::STRICT_FMA;
		case Intrinsic::ctpop:
		return ISD::CTPOP;
		case Intrinsic::ctlz:
		return ISD::CTLZ;
		case Intrinsic::cttz:
		return ISD::CTTZ;
		case Intrinsic::bswap:
		return ISD::BSWAP;
		case Intrinsic::bitreverse:
		return ISD::BITREVERSE;
		}
		// Sentinel meaning "no ISD opcode is listed for this intrinsic".
		return ISD::DELETED_NODE;
		}

		/// Reports whether intrinsic \p Id on \p VecTy is expected to stay a vector
		/// operation.
		///
		/// Only intrinsics that map to FEXP, FEXP2, FLOG, FLOG2, FLOG10, FSIN, FCOS
		/// or FSQRT are actually checked; every other intrinsic (including those
		/// with no ISD mapping) is reported as legal. For the checked ones the
		/// answer is false when the type action for the EVT of \p VecTy is
		/// TypeScalarizeVector, or when the operation action for that EVT is Expand.
		///
		/// NOTE(review): unlike isLegalVectorOp, the operation action is queried on
		/// the EVT of \p VecTy itself rather than on the type produced by
		/// getTypeConversion - confirm this difference is intended.
		bool isLegalVectorIntrinsic(Intrinsic::ID Id, VectorType *VecTy) const {
		  // Translate once and reuse the result for both the filter and the query.
		  const unsigned Opc = intrinsicIdToISD(Id);
		  switch (Opc) {
		  default:
		    return true;
		  case ISD::FEXP:
		  case ISD::FEXP2:
		  case ISD::FLOG:
		  case ISD::FLOG2:
		  case ISD::FLOG10:
		  case ISD::FSIN:
		  case ISD::FCOS:
		  case ISD::FSQRT:
		    break;
		  }

		  const auto *Lowering = getTLI();
		  const EVT VT = Lowering->getValueType(DL, VecTy);
		  return Lowering->getTypeAction(VecTy->getContext(), VT) !=
		             TargetLoweringBase::TypeScalarizeVector &&
		         Lowering->getOperationAction(Opc, VT) != TargetLowering::Expand;
		}

unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,		unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
Type *ScalarValTy) const {		Type *ScalarValTy) const {
auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {		auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);		auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
EVT VT = getTLI()->getValueType(DL, SrcTy);		EVT VT = getTLI()->getValueType(DL, SrcTy);
if (getTLI()->isOperationLegal(ISD::STORE, VT) \|\|		if (getTLI()->isOperationLegal(ISD::STORE, VT) \|\|
getTLI()->isOperationCustom(ISD::STORE, VT))		getTLI()->isOperationCustom(ISD::STORE, VT))
return true;		return true;
▲ Show 20 Lines • Show All 1,376 Lines • ▼ Show 20 Lines	if (!Tys.empty()) {
IID == Intrinsic::vector_reduce_fmul)		IID == Intrinsic::vector_reduce_fmul)
VecTyIndex = 1;		VecTyIndex = 1;
assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");		assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);		VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
}		}

// Library call cost - other than size, make it expensive.		// Library call cost - other than size, make it expensive.
unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;		unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
unsigned ISD = 0;		// Look for intrinsics that can be lowered directly or turned into a
		// scalar intrinsic call.
		unsigned ISD = intrinsicIdToISD(IID);
		if (ISD == ISD::DELETED_NODE) {
switch (IID) {		switch (IID) {
default: {		default: {
// Scalable vectors cannot be scalarized, so return Invalid.		// Scalable vectors cannot be scalarized, so return Invalid.
if (isa<ScalableVectorType>(RetTy) \|\| any_of(Tys, [](const Type *Ty) {		if (isa<ScalableVectorType>(RetTy) \|\| any_of(Tys, [](const Type *Ty) {
return isa<ScalableVectorType>(Ty);		return isa<ScalableVectorType>(Ty);
}))		}))
return InstructionCost::getInvalid();		return InstructionCost::getInvalid();

// Assume that we need to scalarize this intrinsic.		// Assume that we need to scalarize this intrinsic.
InstructionCost ScalarizationCost =		InstructionCost ScalarizationCost =
SkipScalarizationCost ? ScalarizationCostPassed : 0;		SkipScalarizationCost ? ScalarizationCostPassed : 0;
unsigned ScalarCalls = 1;		unsigned ScalarCalls = 1;
Type *ScalarRetTy = RetTy;		Type *ScalarRetTy = RetTy;
if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {		if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
if (!SkipScalarizationCost)		if (!SkipScalarizationCost)
ScalarizationCost = getScalarizationOverhead(		ScalarizationCost = getScalarizationOverhead(
RetVTy, /Insert/ true, /Extract/ false, CostKind);		RetVTy, /Insert/ true, /Extract/ false, CostKind);
ScalarCalls = std::max(ScalarCalls,		ScalarCalls = std::max(
cast<FixedVectorType>(RetVTy)->getNumElements());		ScalarCalls, cast<FixedVectorType>(RetVTy)->getNumElements());
ScalarRetTy = RetTy->getScalarType();		ScalarRetTy = RetTy->getScalarType();
}		}
SmallVector<Type *, 4> ScalarTys;		SmallVector<Type *, 4> ScalarTys;
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {		for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
Type *Ty = Tys[i];		Type *Ty = Tys[i];
if (auto *VTy = dyn_cast<VectorType>(Ty)) {		if (auto *VTy = dyn_cast<VectorType>(Ty)) {
if (!SkipScalarizationCost)		if (!SkipScalarizationCost)
ScalarizationCost += getScalarizationOverhead(		ScalarizationCost += getScalarizationOverhead(
VTy, /Insert/ false, /Extract/ true, CostKind);		VTy, /Insert/ false, /Extract/ true, CostKind);
ScalarCalls = std::max(ScalarCalls,		ScalarCalls = std::max(
cast<FixedVectorType>(VTy)->getNumElements());		ScalarCalls, cast<FixedVectorType>(VTy)->getNumElements());
Ty = Ty->getScalarType();		Ty = Ty->getScalarType();
}		}
ScalarTys.push_back(Ty);		ScalarTys.push_back(Ty);
}		}
if (ScalarCalls == 1)		if (ScalarCalls == 1)
return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.		return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);		IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
InstructionCost ScalarCost =		InstructionCost ScalarCost =
thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);		thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);

return ScalarCalls * ScalarCost + ScalarizationCost;		return ScalarCalls * ScalarCost + ScalarizationCost;
}		}
// Look for intrinsics that can be lowered directly or turned into a scalar
// intrinsic call.
case Intrinsic::sqrt:
ISD = ISD::FSQRT;
break;
case Intrinsic::sin:
ISD = ISD::FSIN;
break;
case Intrinsic::cos:
ISD = ISD::FCOS;
break;
case Intrinsic::exp:
ISD = ISD::FEXP;
break;
case Intrinsic::exp2:
ISD = ISD::FEXP2;
break;
case Intrinsic::exp10:
ISD = ISD::FEXP10;
break;
case Intrinsic::log:
ISD = ISD::FLOG;
break;
case Intrinsic::log10:
ISD = ISD::FLOG10;
break;
case Intrinsic::log2:
ISD = ISD::FLOG2;
break;
case Intrinsic::fabs:
ISD = ISD::FABS;
break;
case Intrinsic::canonicalize:
ISD = ISD::FCANONICALIZE;
break;
case Intrinsic::minnum:
ISD = ISD::FMINNUM;
break;
case Intrinsic::maxnum:
ISD = ISD::FMAXNUM;
break;
case Intrinsic::minimum:
ISD = ISD::FMINIMUM;
break;
case Intrinsic::maximum:
ISD = ISD::FMAXIMUM;
break;
case Intrinsic::copysign:
ISD = ISD::FCOPYSIGN;
break;
case Intrinsic::floor:
ISD = ISD::FFLOOR;
break;
case Intrinsic::ceil:
ISD = ISD::FCEIL;
break;
case Intrinsic::trunc:
ISD = ISD::FTRUNC;
break;
case Intrinsic::nearbyint:
ISD = ISD::FNEARBYINT;
break;
case Intrinsic::rint:
ISD = ISD::FRINT;
break;
case Intrinsic::round:
ISD = ISD::FROUND;
break;
case Intrinsic::roundeven:
ISD = ISD::FROUNDEVEN;
break;
case Intrinsic::pow:
ISD = ISD::FPOW;
break;
case Intrinsic::fma:
ISD = ISD::FMA;
break;
case Intrinsic::fmuladd:
ISD = ISD::FMA;
break;
case Intrinsic::experimental_constrained_fmuladd:
ISD = ISD::STRICT_FMA;
break;
// FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.		// FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
case Intrinsic::lifetime_start:		case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:		case Intrinsic::lifetime_end:
case Intrinsic::sideeffect:		case Intrinsic::sideeffect:
case Intrinsic::pseudoprobe:		case Intrinsic::pseudoprobe:
case Intrinsic::arithmetic_fence:		case Intrinsic::arithmetic_fence:
return 0;		return 0;
case Intrinsic::masked_store: {		case Intrinsic::masked_store: {
Type *Ty = Tys[0];		Type *Ty = Tys[0];
Align TyAlign = thisT()->DL.getABITypeAlign(Ty);		Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,		return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign,
CostKind);		0, CostKind);
}		}
case Intrinsic::masked_load: {		case Intrinsic::masked_load: {
Type *Ty = RetTy;		Type *Ty = RetTy;
Align TyAlign = thisT()->DL.getABITypeAlign(Ty);		Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,		return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
CostKind);		CostKind);
}		}
case Intrinsic::vector_reduce_add:		case Intrinsic::vector_reduce_add:
return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
std::nullopt, CostKind);		std::nullopt, CostKind);
case Intrinsic::vector_reduce_mul:		case Intrinsic::vector_reduce_mul:
return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
std::nullopt, CostKind);		std::nullopt, CostKind);
case Intrinsic::vector_reduce_and:		case Intrinsic::vector_reduce_and:
return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
std::nullopt, CostKind);		std::nullopt, CostKind);
case Intrinsic::vector_reduce_or:		case Intrinsic::vector_reduce_or:
return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
std::nullopt, CostKind);		std::nullopt, CostKind);
case Intrinsic::vector_reduce_xor:		case Intrinsic::vector_reduce_xor:
return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
std::nullopt, CostKind);		std::nullopt, CostKind);
case Intrinsic::vector_reduce_fadd:		case Intrinsic::vector_reduce_fadd:
return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
FMF, CostKind);		FMF, CostKind);
case Intrinsic::vector_reduce_fmul:		case Intrinsic::vector_reduce_fmul:
return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,		return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
FMF, CostKind);		FMF, CostKind);
case Intrinsic::vector_reduce_smax:		case Intrinsic::vector_reduce_smax:
return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_smin:		case Intrinsic::vector_reduce_smin:
return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_umax:		case Intrinsic::vector_reduce_umax:
return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_umin:		case Intrinsic::vector_reduce_umin:
return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_fmax:		case Intrinsic::vector_reduce_fmax:
return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_fmin:		case Intrinsic::vector_reduce_fmin:
return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_fmaximum:		case Intrinsic::vector_reduce_fmaximum:
return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::vector_reduce_fminimum:		case Intrinsic::vector_reduce_fminimum:
return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy,		return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy,
ICA.getFlags(), CostKind);		ICA.getFlags(), CostKind);
case Intrinsic::abs: {		case Intrinsic::abs: {
// abs(X) = select(icmp(X,0),X,sub(0,X))		// abs(X) = select(icmp(X,0),X,sub(0,X))
Type *CondTy = RetTy->getWithNewBitWidth(1);		Type *CondTy = RetTy->getWithNewBitWidth(1);
CmpInst::Predicate Pred = CmpInst::ICMP_SGT;		CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
InstructionCost Cost = 0;		InstructionCost Cost = 0;
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,		Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
Pred, CostKind);		Pred, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,		Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
Pred, CostKind);		CondTy, Pred, CostKind);
// TODO: Should we add an OperandValueProperties::OP_Zero property?		// TODO: Should we add an OperandValueProperties::OP_Zero property?
Cost += thisT()->getArithmeticInstrCost(		Cost += thisT()->getArithmeticInstrCost(
BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});		BinaryOperator::Sub, RetTy, CostKind,
		{TTI::OK_UniformConstantValue, TTI::OP_None});
return Cost;		return Cost;
}		}
case Intrinsic::smax:		case Intrinsic::smax:
case Intrinsic::smin:		case Intrinsic::smin:
case Intrinsic::umax:		case Intrinsic::umax:
case Intrinsic::umin: {		case Intrinsic::umin: {
// minmax(X,Y) = select(icmp(X,Y),X,Y)		// minmax(X,Y) = select(icmp(X,Y),X,Y)
Type *CondTy = RetTy->getWithNewBitWidth(1);		Type *CondTy = RetTy->getWithNewBitWidth(1);
bool IsUnsigned = IID == Intrinsic::umax \|\| IID == Intrinsic::umin;		bool IsUnsigned = IID == Intrinsic::umax \|\| IID == Intrinsic::umin;
CmpInst::Predicate Pred =		CmpInst::Predicate Pred =
IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;		IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
InstructionCost Cost = 0;		InstructionCost Cost = 0;
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,		Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
Pred, CostKind);		Pred, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,		Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
Pred, CostKind);		CondTy, Pred, CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::sadd_sat:		case Intrinsic::sadd_sat:
case Intrinsic::ssub_sat: {		case Intrinsic::ssub_sat: {
Type *CondTy = RetTy->getWithNewBitWidth(1);		Type *CondTy = RetTy->getWithNewBitWidth(1);

Type *OpTy = StructType::create({RetTy, CondTy});		Type *OpTy = StructType::create({RetTy, CondTy});
Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat		Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
? Intrinsic::sadd_with_overflow		? Intrinsic::sadd_with_overflow
: Intrinsic::ssub_with_overflow;		: Intrinsic::ssub_with_overflow;
CmpInst::Predicate Pred = CmpInst::ICMP_SGT;		CmpInst::Predicate Pred = CmpInst::ICMP_SGT;

// SatMax -> Overflow && SumDiff < 0		// SatMax -> Overflow && SumDiff < 0
// SatMin -> Overflow && SumDiff >= 0		// SatMin -> Overflow && SumDiff >= 0
InstructionCost Cost = 0;		InstructionCost Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,		IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);		nullptr, ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);		Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,		Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
Pred, CostKind);		Pred, CostKind);
Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,		Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
CondTy, Pred, CostKind);		CondTy, Pred, CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::uadd_sat:		case Intrinsic::uadd_sat:
case Intrinsic::usub_sat: {		case Intrinsic::usub_sat: {
Type *CondTy = RetTy->getWithNewBitWidth(1);		Type *CondTy = RetTy->getWithNewBitWidth(1);

Type *OpTy = StructType::create({RetTy, CondTy});		Type *OpTy = StructType::create({RetTy, CondTy});
Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat		Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
? Intrinsic::uadd_with_overflow		? Intrinsic::uadd_with_overflow
: Intrinsic::usub_with_overflow;		: Intrinsic::usub_with_overflow;

InstructionCost Cost = 0;		InstructionCost Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,		IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);		nullptr, ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);		Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=		Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,		thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind);		CmpInst::BAD_ICMP_PREDICATE, CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::smul_fix:		case Intrinsic::smul_fix:
case Intrinsic::umul_fix: {		case Intrinsic::umul_fix: {
unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;		unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);		Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);

unsigned ExtOp =		unsigned ExtOp =
IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;		IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
TTI::CastContextHint CCH = TTI::CastContextHint::None;		TTI::CastContextHint CCH = TTI::CastContextHint::None;

InstructionCost Cost = 0;		InstructionCost Cost = 0;
Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);		Cost +=
		2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
Cost +=		Cost +=
thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);		thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,		Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
CCH, CostKind);		CCH, CostKind);
Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,		Cost += thisT()->getArithmeticInstrCost(
CostKind,		Instruction::LShr, RetTy, CostKind,
{TTI::OK_AnyValue, TTI::OP_None},		{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_UniformConstantValue, TTI::OP_None});		{TTI::OK_UniformConstantValue, TTI::OP_None});
Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,		Cost += thisT()->getArithmeticInstrCost(
{TTI::OK_AnyValue, TTI::OP_None},		Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_UniformConstantValue, TTI::OP_None});		{TTI::OK_UniformConstantValue, TTI::OP_None});
Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);		Cost +=
		thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::sadd_with_overflow:		case Intrinsic::sadd_with_overflow:
case Intrinsic::ssub_with_overflow: {		case Intrinsic::ssub_with_overflow: {
Type *SumTy = RetTy->getContainedType(0);		Type *SumTy = RetTy->getContainedType(0);
Type *OverflowTy = RetTy->getContainedType(1);		Type *OverflowTy = RetTy->getContainedType(1);
unsigned Opcode = IID == Intrinsic::sadd_with_overflow		unsigned Opcode = IID == Intrinsic::sadd_with_overflow
? BinaryOperator::Add		? BinaryOperator::Add
: BinaryOperator::Sub;		: BinaryOperator::Sub;

// Add:		// Add:
// Overflow -> (Result < LHS) ^ (RHS < 0)		// Overflow -> (Result < LHS) ^ (RHS < 0)
// Sub:		// Sub:
// Overflow -> (Result < LHS) ^ (RHS > 0)		// Overflow -> (Result < LHS) ^ (RHS > 0)
InstructionCost Cost = 0;		InstructionCost Cost = 0;
Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);		Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
Cost += 2 * thisT()->getCmpSelInstrCost(		Cost += 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy,
Instruction::ICmp, SumTy, OverflowTy,		OverflowTy, CmpInst::ICMP_SGT,
CmpInst::ICMP_SGT, CostKind);		CostKind);
Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,		Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
CostKind);		CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::uadd_with_overflow:		case Intrinsic::uadd_with_overflow:
case Intrinsic::usub_with_overflow: {		case Intrinsic::usub_with_overflow: {
Type *SumTy = RetTy->getContainedType(0);		Type *SumTy = RetTy->getContainedType(0);
Type *OverflowTy = RetTy->getContainedType(1);		Type *OverflowTy = RetTy->getContainedType(1);
unsigned Opcode = IID == Intrinsic::uadd_with_overflow		unsigned Opcode = IID == Intrinsic::uadd_with_overflow
? BinaryOperator::Add		? BinaryOperator::Add
: BinaryOperator::Sub;		: BinaryOperator::Sub;
CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow		CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
? CmpInst::ICMP_ULT		? CmpInst::ICMP_ULT
: CmpInst::ICMP_UGT;		: CmpInst::ICMP_UGT;

InstructionCost Cost = 0;		InstructionCost Cost = 0;
Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);		Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
Cost +=		Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,		OverflowTy, Pred, CostKind);
Pred, CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::smul_with_overflow:		case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: {		case Intrinsic::umul_with_overflow: {
Type *MulTy = RetTy->getContainedType(0);		Type *MulTy = RetTy->getContainedType(0);
Type *OverflowTy = RetTy->getContainedType(1);		Type *OverflowTy = RetTy->getContainedType(1);
unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;		unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);		Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
bool IsSigned = IID == Intrinsic::smul_with_overflow;		bool IsSigned = IID == Intrinsic::smul_with_overflow;

unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;		unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
TTI::CastContextHint CCH = TTI::CastContextHint::None;		TTI::CastContextHint CCH = TTI::CastContextHint::None;

InstructionCost Cost = 0;		InstructionCost Cost = 0;
Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);		Cost +=
		2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
Cost +=		Cost +=
thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);		thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,		Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
CCH, CostKind);		CCH, CostKind);
Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,		Cost += thisT()->getArithmeticInstrCost(
CostKind,		Instruction::LShr, ExtTy, CostKind,
{TTI::OK_AnyValue, TTI::OP_None},		{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_UniformConstantValue, TTI::OP_None});		{TTI::OK_UniformConstantValue, TTI::OP_None});

if (IsSigned)		if (IsSigned)
Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,		Cost += thisT()->getArithmeticInstrCost(
CostKind,		Instruction::AShr, MulTy, CostKind,
{TTI::OK_AnyValue, TTI::OP_None},		{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_UniformConstantValue, TTI::OP_None});		{TTI::OK_UniformConstantValue, TTI::OP_None});

Cost += thisT()->getCmpSelInstrCost(		Cost +=
BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);		thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
		CmpInst::ICMP_NE, CostKind);
return Cost;		return Cost;
}		}
case Intrinsic::fptosi_sat:		case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat: {		case Intrinsic::fptoui_sat: {
if (Tys.empty())		if (Tys.empty())
break;		break;
Type *FromTy = Tys[0];		Type *FromTy = Tys[0];
bool IsSigned = IID == Intrinsic::fptosi_sat;		bool IsSigned = IID == Intrinsic::fptosi_sat;

InstructionCost Cost = 0;		InstructionCost Cost = 0;
IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,		IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
{FromTy, FromTy});		{FromTy, FromTy});
Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);		Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,		IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
{FromTy, FromTy});		{FromTy, FromTy});
Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);		Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
Cost += thisT()->getCastInstrCost(		Cost += thisT()->getCastInstrCost(
IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,		IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
TTI::CastContextHint::None, CostKind);		TTI::CastContextHint::None, CostKind);
if (IsSigned) {		if (IsSigned) {
Type *CondTy = RetTy->getWithNewBitWidth(1);		Type *CondTy = RetTy->getWithNewBitWidth(1);
Cost += thisT()->getCmpSelInstrCost(		Cost +=
BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);		thisT()->getCmpSelInstrCost(BinaryOperator::FCmp, FromTy, CondTy,
Cost += thisT()->getCmpSelInstrCost(		CmpInst::FCMP_UNO, CostKind);
BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);		Cost +=
		thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
		CmpInst::FCMP_UNO, CostKind);
}		}
return Cost;		return Cost;
}		}
case Intrinsic::ctpop:		}
ISD = ISD::CTPOP;		} else if (ISD == ISD::CTPOP) {
// In case of legalization use TCC_Expensive. This is cheaper than a		// In case of legalization use TCC_Expensive. This is cheaper than a
// library call but still not a cheap instruction.		// library call but still not a cheap instruction.
SingleCallCost = TargetTransformInfo::TCC_Expensive;		SingleCallCost = TargetTransformInfo::TCC_Expensive;
break;
case Intrinsic::ctlz:
ISD = ISD::CTLZ;
break;
case Intrinsic::cttz:
ISD = ISD::CTTZ;
break;
case Intrinsic::bswap:
ISD = ISD::BSWAP;
break;
case Intrinsic::bitreverse:
ISD = ISD::BITREVERSE;
break;
}		}

const TargetLoweringBase *TLI = getTLI();		const TargetLoweringBase *TLI = getTLI();
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);		std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {		if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&		if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
TLI->isFAbsFree(LT.second)) {		TLI->isFAbsFree(LT.second)) {
▲ Show 20 Lines • Show All 338 Lines • Show Last 20 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 387 Lines • ▼ Show 20 Lines	bool TargetTransformInfo::isLegalAddressingMode(Type Ty, GlobalValue BaseGV,
int64_t BaseOffset,		int64_t BaseOffset,
bool HasBaseReg, int64_t Scale,		bool HasBaseReg, int64_t Scale,
unsigned AddrSpace,		unsigned AddrSpace,
Instruction *I) const {		Instruction *I) const {
return TTIImpl->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,		return TTIImpl->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
Scale, AddrSpace, I);		Scale, AddrSpace, I);
}		}

		/// Asks the wrapped implementation (TTIImpl) whether the operation
		/// \p Opcode on the vector type \p VecTy is legal, i.e. is not going to be
		/// scalarized. This wrapper adds no logic of its own; the answer comes
		/// entirely from TTIImpl.
		bool TargetTransformInfo::isLegalVectorOp(unsigned Opcode,
		VectorType *VecTy) const {
		return TTIImpl->isLegalVectorOp(Opcode, VecTy);
		}

		/// Asks the wrapped implementation (TTIImpl) whether the intrinsic \p Id
		/// operating on the vector type \p VecTy is legal, i.e. is not going to be
		/// scalarized. This wrapper adds no logic of its own; the answer comes
		/// entirely from TTIImpl.
		bool TargetTransformInfo::isLegalVectorIntrinsic(Intrinsic::ID Id,
		VectorType *VecTy) const {
		return TTIImpl->isLegalVectorIntrinsic(Id, VecTy);
		}

bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1,		bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1,
const LSRCost &C2) const {		const LSRCost &C2) const {
return TTIImpl->isLSRCostLess(C1, C2);		return TTIImpl->isLSRCostLess(C1, C2);
}		}

bool TargetTransformInfo::isNumRegsMajorCostOfLSR() const {		bool TargetTransformInfo::isNumRegsMajorCostOfLSR() const {
return TTIImpl->isNumRegsMajorCostOfLSR();		return TTIImpl->isNumRegsMajorCostOfLSR();
}		}
▲ Show 20 Lines • Show All 885 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,189 Lines • ▼ Show 20 Lines	for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();		BlockScheduling *BS = Iter.second.get();
BS->clear();		BS->clear();
}		}
MinBWs.clear();		MinBWs.clear();
InstrElementSize.clear();		InstrElementSize.clear();
UserIgnoreList = nullptr;		UserIgnoreList = nullptr;
PostponedGathers.clear();		PostponedGathers.clear();
ValueToGatherNodes.clear();		ValueToGatherNodes.clear();
		OperandsToVectorize.clear();
		}

		/// Returns the list of the operands to try to vectorize later, if the user
		/// node was not vectorized.
		ArrayRef<SmallVector<Value *>> operandsToVectorize() const {
		return OperandsToVectorize;
}		}

unsigned getTreeSize() const { return VectorizableTree.size(); }		unsigned getTreeSize() const { return VectorizableTree.size(); }

/// Perform LICM and CSE on the newly generated gather sequences.		/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();		void optimizeGatherSequence();

/// Checks if the specified gather tree entry \p TE can be represented as a		/// Checks if the specified gather tree entry \p TE can be represented as a
▲ Show 20 Lines • Show All 1,223 Lines • ▼ Show 20 Lines	return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
const_cast<TreeEntry *>(UserTE), OpIdx);		const_cast<TreeEntry *>(UserTE), OpIdx);
}		}

/// Checks if all users of \p I are the part of the vectorization tree.		/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(		bool areAllUsersVectorized(
Instruction *I,		Instruction *I,
const SmallDenseSet<Value > VectorizedVals = nullptr) const;		const SmallDenseSet<Value > VectorizedVals = nullptr) const;

		/// Checks whether the list of values \p VL is worth vectorizing, i.e. the
		/// vector instruction built for it is not going to be scalarized later.
		bool isLegalVectorOp(ArrayRef<Value *> VL);

/// Return information about the vector formed for the specified index		/// Return information about the vector formed for the specified index
/// of a vector of (the same) instruction.		/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);		TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);

/// \returns the cost of the vectorizable entry.		/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,		InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals,		ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts);		SmallPtrSetImpl<Value *> &CheckedExtracts);
▲ Show 20 Lines • Show All 529 Lines • ▼ Show 20 Lines	#endif
SmallDenseMap<Value , TreeEntry > ScalarToTreeEntry;		SmallDenseMap<Value , TreeEntry > ScalarToTreeEntry;

/// Maps a value to the proposed vectorizable size.		/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;		SmallDenseMap<Value *, unsigned> InstrElementSize;

/// A list of scalars that we found that we need to keep as scalars.		/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;		ValueSet MustGather;

		/// A list of the operands of the nodes, which are not vectorized. These
		/// operands are the candidates for the vectorization later.
		SmallVector<SmallVector<Value *>> OperandsToVectorize;

/// A map between the vectorized entries and the last instructions in the		/// A map between the vectorized entries and the last instructions in the
/// bundles. The bundles are built in use order, not in the def order of the		/// bundles. The bundles are built in use order, not in the def order of the
/// instructions. So, we cannot rely directly on the last instruction in the		/// instructions. So, we cannot rely directly on the last instruction in the
/// bundle being the last instruction in the program order during		/// bundle being the last instruction in the program order during
/// vectorization process since the basic blocks are affected, need to		/// vectorization process since the basic blocks are affected, need to
/// pre-gather them before.		/// pre-gather them before.
DenseMap<const TreeEntry , Instruction > EntryToLastInstruction;		DenseMap<const TreeEntry , Instruction > EntryToLastInstruction;

▲ Show 20 Lines • Show All 2,899 Lines • ▼ Show 20 Lines	void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry::EntryState State = getScalarsVectorizationState(		TreeEntry::EntryState State = getScalarsVectorizationState(
S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);		S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
if (State == TreeEntry::NeedToGather) {		if (State == TreeEntry::NeedToGather) {
newTreeEntry(VL, std::nullopt /not vectorized/, S, UserTreeIdx,		newTreeEntry(VL, std::nullopt /not vectorized/, S, UserTreeIdx,
ReuseShuffleIndicies);		ReuseShuffleIndicies);
return;		return;
}		}

		// Check if the generated vector instruction won't be scalarized later.
		if (!isLegalVectorOp(VL)) {
		LLVM_DEBUG(dbgs() << "SLP: scalarized bundle starting " << *S.OpValue
		<< ".\n");
		newTreeEntry(VL, std::nullopt /not vectorized/, S, UserTreeIdx,
		ReuseShuffleIndicies);
		// Gather operands to try to vectorize them later.
		for (unsigned I = 0, End = S.MainOp->getNumOperands(); I < End; ++I) {
		auto &Operands = OperandsToVectorize.emplace_back();
		for (Value *V : VL)
		Operands.push_back(cast<Instruction>(V)->getOperand(I));
		}
		return;
		}

auto &BSRef = BlocksSchedules[BB];		auto &BSRef = BlocksSchedules[BB];
if (!BSRef)		if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);		BSRef = std::make_unique<BlockScheduling>(BB);

BlockScheduling &BS = *BSRef;		BlockScheduling &BS = *BSRef;

std::optional<ScheduleData *> Bundle =		std::optional<ScheduleData *> Bundle =
BS.tryScheduleBundle(UniqueValues, this, S);		BS.tryScheduleBundle(UniqueValues, this, S);
▲ Show 20 Lines • Show All 648 Lines • ▼ Show 20 Lines	assert((MainP == P \|\| AltP == P \|\| MainP == SwappedP \|\| AltP == SwappedP) &&
"CmpInst expected to match either main or alternate predicate or "		"CmpInst expected to match either main or alternate predicate or "
"their swap.");		"their swap.");
(void)AltP;		(void)AltP;
return MainP != P && MainP != SwappedP;		return MainP != P && MainP != SwappedP;
}		}
return I->getOpcode() == AltOp->getOpcode();		return I->getOpcode() == AltOp->getOpcode();
}		}

		/// Checks whether the bundle \p VL is worth vectorizing, i.e. whether the
		/// vector instruction that would be emitted for it is reported by the
		/// target as legal rather than something that gets scalarized again.
		///
		/// The decision is driven by the first value of the bundle and the common
		/// opcode computed by getSameOpcode():
		///  - stores and insertelements are always accepted;
		///  - bundles whose element type is not a valid vector element type are
		///    rejected;
		///  - integer mul/div/rem with an all-constant second operand is accepted
		///    without asking the target;
		///  - other arithmetic/logic opcodes are forwarded to
		///    TTI->isLegalVectorOp();
		///  - calls are accepted when the first vector call cost is greater than
		///    the second one, or when the target reports the intrinsic as legal;
		///  - everything else (including alternate-opcode shuffles) is accepted.
		///
		/// \param VL the scalars of the candidate bundle; must not be empty.
		/// \returns true if the bundle should be vectorized, false if it is
		/// expected to be scalarized.
		bool BoUpSLP::isLegalVectorOp(ArrayRef<Value *> VL) {
		  InstructionsState S = getSameOpcode(VL, *TLI);
		  const unsigned Sz = VL.size();
		  Value *V0 = VL.front();
		  if (isa<StoreInst, InsertElementInst>(V0))
		    return true;

		  // Pick the scalar type the legality query should be based on: compares
		  // use their operand type, casts use their source type (except for
		  // bitcasts and fp-to-int conversions, which keep the result type).
		  Type *ScalarTy = V0->getType();
		  if (auto *Cmp = dyn_cast<CmpInst>(V0)) {
		    ScalarTy = Cmp->getOperand(0)->getType();
		  } else if (auto *Cast = dyn_cast<CastInst>(V0)) {
		    // The original list named FPToSIInst twice; the unsigned conversion
		    // is the evident counterpart.
		    if (!isa<BitCastInst, FPToSIInst, FPToUIInst>(Cast))
		      ScalarTy = Cast->getSrcTy();
		  }
		  if (!isValidElementType(ScalarTy))
		    return false;
		  auto *VecTy = FixedVectorType::get(ScalarTy, Sz);

		  // If we have computed a smaller type for the expression, update VecTy so
		  // that the legality query sees the type that will actually be used.
		  const auto It = MinBWs.find(V0);
		  if (It != MinBWs.end())
		    VecTy = FixedVectorType::get(
		        IntegerType::get(F->getContext(), It->second.first), Sz);

		  unsigned ShuffleOrOp =
		      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
		  switch (ShuffleOrOp) {
		  case Instruction::URem:
		  case Instruction::SRem:
		  case Instruction::UDiv:
		  case Instruction::SDiv:
		  case Instruction::Mul: {
		    // Check if it can be represented as shift: an all-constant second
		    // operand is accepted without consulting the target.
		    SmallVector<Value *> Ops;
		    Ops.reserve(Sz);
		    for (Value *V : VL)
		      Ops.push_back(cast<Instruction>(V)->getOperand(1));
		    if (getOperandInfo(Ops).isConstant())
		      return true;
		    return TTI->isLegalVectorOp(ShuffleOrOp, VecTy);
		  }
		  case Instruction::FNeg:
		  case Instruction::Add:
		  case Instruction::FAdd:
		  case Instruction::Sub:
		  case Instruction::FSub:
		  case Instruction::FMul:
		  case Instruction::FDiv:
		  case Instruction::FRem:
		  case Instruction::Shl:
		  case Instruction::LShr:
		  case Instruction::AShr:
		  case Instruction::And:
		  case Instruction::Or:
		  case Instruction::Xor:
		    return TTI->isLegalVectorOp(ShuffleOrOp, VecTy);
		  case Instruction::Call: {
		    auto *CI = cast<CallInst>(V0);
		    // NOTE(review): presumably .first is the vector intrinsic cost and
		    // .second the vector library call cost - confirm against
		    // getVectorCallCosts().
		    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
		    return (VecCallCosts.first > VecCallCosts.second ||
		            TTI->isLegalVectorIntrinsic(CI->getIntrinsicID(), VecTy));
		  }
		  default:
		    return true;
		  }
		}

TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {		TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
assert(!Ops.empty());		assert(!Ops.empty());
const auto *Op0 = Ops.front();		const auto *Op0 = Ops.front();

const bool IsConstant = all_of(Ops, [](Value *V) {		const bool IsConstant = all_of(Ops, [](Value *V) {
// TODO: We should allow undef elements here		// TODO: We should allow undef elements here
return isConstant(V) && !isa<UndefValue>(V);		return isConstant(V) && !isa<UndefValue>(V);
});		});
▲ Show 20 Lines • Show All 5,941 Lines • ▼ Show 20 Lines	bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,

if (Changed) {		if (Changed) {
R.optimizeGatherSequence();		R.optimizeGatherSequence();
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");		LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
}		}
return Changed;		return Changed;
}		}

		/// Tries to vectorize the operand bundles that \p R recorded for later
		/// processing (see BoUpSLP::operandsToVectorize()).
		///
		/// The recorded bundles are used as a worklist. For each bundle a fresh
		/// tree is built in \p R, its cost is computed, and the tree is vectorized
		/// if the cost is below -SLPCostThreshold. Operand bundles recorded while
		/// building that tree are appended to the worklist, so processing
		/// continues down the operand chains.
		///
		/// \param R the SLP tree builder; its current tree is replaced by the
		/// trees built here.
		/// \returns true if at least one operand bundle was vectorized.
		static bool vectorizeOperands(BoUpSLP &R) {
		  SmallVector<SmallVector<Value *>> Operands(R.operandsToVectorize().begin(),
		                                             R.operandsToVectorize().end());
		  // Bundles are deduplicated by hash value only, so two different bundles
		  // with colliding hashes are treated as the same one and the later one is
		  // skipped.
		  DenseSet<hash_code> VisitedOperands;
		  bool Changed = false;
		  while (!Operands.empty()) {
		    SmallVector<Value *> Chain = Operands.pop_back_val();
		    if (!VisitedOperands.insert(hash_value(ArrayRef(Chain))).second)
		      continue;
		    unsigned VF = Chain.size();
		    R.buildTree(Chain);
		    // An unprofitable or unsuitable bundle only disqualifies itself: skip
		    // it and keep going. Returning here would drop the remaining bundles
		    // and, worse, discard Changed after earlier bundles were already
		    // vectorized.
		    if (R.isTreeTinyAndNotFullyVectorizable())
		      continue;
		    if (R.isLoadCombineCandidate())
		      continue;
		    R.reorderTopToBottom();
		    R.reorderBottomToTop();
		    R.buildExternalUses();

		    R.computeMinimumValueSizes();

		    InstructionCost Cost = R.getTreeCost();

		    LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
		                      << "\n");
		    if (Cost < -SLPCostThreshold) {
		      LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

		      using namespace ore;

		      // NOTE(review): assumes Chain[0] is an Instruction whenever the tree
		      // is profitable - confirm, since operands may be arguments or
		      // constants.
		      R.getORE()->emit(OptimizationRemark(SV_NAME, "OperandsVectorized",
		                                          cast<Instruction>(Chain[0]))
		                       << "Operands SLP vectorized with cost "
		                       << NV("Cost", Cost) << " and with tree size "
		                       << NV("TreeSize", R.getTreeSize()));

		      R.vectorizeTree();
		      Changed = true;
		    }
		    // Queue the operand bundles recorded while building this tree.
		    Operands.append(R.operandsToVectorize().begin(),
		                    R.operandsToVectorize().end());
		  }
		  return Changed;
		}

bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,		bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx, unsigned MinVF) {		unsigned Idx, unsigned MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()		LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");		<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);		const unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = Chain.size();		unsigned VF = Chain.size();

if (!isPowerOf2_32(Sz) \|\| !isPowerOf2_32(VF) \|\| VF < 2 \|\| VF < MinVF)		if (!isPowerOf2_32(Sz) \|\| !isPowerOf2_32(VF) \|\| VF < 2 \|\| VF < MinVF)
Show All 23 Lines	if (Cost < -SLPCostThreshold) {

R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",		R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
cast<StoreInst>(Chain[0]))		cast<StoreInst>(Chain[0]))
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)		<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
<< " and with tree size "		<< " and with tree size "
<< NV("TreeSize", R.getTreeSize()));		<< NV("TreeSize", R.getTreeSize()));

R.vectorizeTree();		R.vectorizeTree();
		(void)vectorizeOperands(R);
return true;		return true;
}		}

return false;		return vectorizeOperands(R);
}		}

bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,		bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {		BoUpSLP &R) {
// We may run into multiple chains that merge into a single chain. We mark the		// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.		// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;		BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;		bool Changed = false;
▲ Show 20 Lines • Show All 345 Lines • ▼ Show 20 Lines	for (unsigned I = NextInst; I < MaxInst; ++I) {
<< ore::NV("TreeSize", R.getTreeSize()));		<< ore::NV("TreeSize", R.getTreeSize()));

R.vectorizeTree();		R.vectorizeTree();
// Move to the next bundle.		// Move to the next bundle.
I += VF - 1;		I += VF - 1;
NextInst = I + 1;		NextInst = I + 1;
Changed = true;		Changed = true;
}		}
		Changed \|= vectorizeOperands(R);
}		}
}		}

if (!Changed && CandidateFound) {		if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {		R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)		return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
<< "List vectorization was possible but not beneficial with cost "		<< "List vectorization was possible but not beneficial with cost "
<< ore::NV("Cost", MinCost) << " >= "		<< ore::NV("Cost", MinCost) << " >= "
▲ Show 20 Lines • Show All 1,000 Lines • ▼ Show 20 Lines	for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);		Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
Instruction *InsertPt = RdxRootInst;		Instruction *InsertPt = RdxRootInst;
if (IsCmpSelMinMax)		if (IsCmpSelMinMax)
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);		InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

// Vectorize a tree.		// Vectorize a tree.
Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,		Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
ReplacedExternals, InsertPt);		ReplacedExternals, InsertPt);
		(void)vectorizeOperands(V);

Builder.SetInsertPoint(InsertPt);		Builder.SetInsertPoint(InsertPt);

// To prevent poison from leaking across what used to be sequential,		// To prevent poison from leaking across what used to be sequential,
// safe, scalar boolean logic operations, the reduction operand must be		// safe, scalar boolean logic operations, the reduction operand must be
// frozen.		// frozen.
if (isBoolLogicOp(RdxRootInst))		if (isBoolLogicOp(RdxRootInst))
VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);		VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
▲ Show 20 Lines • Show All 1,494 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

	Show All 18 Lines
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])			; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
	; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]			; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
				; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.sin.f32(float %vecext)			%1 = tail call fast float @llvm.sin.f32(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.sin.f32(float %vecext.1)			%2 = tail call fast float @llvm.sin.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 952 Lines • ▼ Show 20 Lines
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])			; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
	; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]			; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
				; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.cos.f32(float %vecext)			%1 = tail call fast float @llvm.cos.f32(float %vecext)
	%vecins = insertelement <4 x float> poison, float %1, i32 0			%vecins = insertelement <4 x float> poison, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.cos.f32(float %vecext.1)			%2 = tail call fast float @llvm.cos.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

	Show All 18 Lines
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])			; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
	; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]			; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
				; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.sin.f32(float %vecext)			%1 = tail call fast float @llvm.sin.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.sin.f32(float %vecext.1)			%2 = tail call fast float @llvm.sin.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 952 Lines • ▼ Show 20 Lines
	; NOACCELERATE-NEXT: entry:			; NOACCELERATE-NEXT: entry:
	; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16			; NOACCELERATE-NEXT: [[TMP0:%.]] = load <4 x float>, ptr [[A:%.]], align 16
	; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])			; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
	; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])			; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
	; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])			; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
	; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]			; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
				; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.cos.f32(float %vecext)			%1 = tail call fast float @llvm.cos.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.cos.f32(float %vecext.1)			%2 = tail call fast float @llvm.cos.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s			; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
	; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s			; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s

	; FIXME: Should still like to vectorize the memory operations for VI			; FIXME: Should still like to vectorize the memory operations for VI
				arsenmUnsubmitted Not Done Reply Inline Actions This fixme was fixed but has now been unfixed arsenm: This fixme was fixed but has now been unfixed
				ABataevAuthorUnsubmitted Done Reply Inline Actions Hmm, I checked the VI tests, and it looks like the fmul <2 x half> tests are scalarized in the end (https://godbolt.org/z/hPcjoETWT). So it is not worth vectorizing them. ABataev: Hmm, I checked the VI tests, and it looks like the fmul <2 x half> tests are scalarized in the end (https…

	; Simple 3-pair chain with loads and stores			; Simple 3-pair chain with loads and stores
	define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) {			define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) {
	; GCN-LABEL: @test1_as_3_3_3_v2f16(			; GFX9-LABEL: @test1_as_3_3_3_v2f16(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2			; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
	; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]			; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
	; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2			; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: ret void			; GFX9-NEXT: ret void
				;
				; VI-LABEL: @test1_as_3_3_3_v2f16(
				; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2
				; VI-NEXT: [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2
				; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]]
				; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1
				; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2
				; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1
				; VI-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2
				; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]]
				; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2
				; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1
				; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2
				; VI-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%i1 = load half, ptr addrspace(3) %b, align 2			%i1 = load half, ptr addrspace(3) %b, align 2
	%mul = fmul half %i0, %i1			%mul = fmul half %i0, %i1
	%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
	%i3 = load half, ptr addrspace(3) %arrayidx3, align 2			%i3 = load half, ptr addrspace(3) %arrayidx3, align 2
	%arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1			%arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
	%i4 = load half, ptr addrspace(3) %arrayidx4, align 2			%i4 = load half, ptr addrspace(3) %arrayidx4, align 2
	%mul5 = fmul half %i3, %i4			%mul5 = fmul half %i3, %i4
	store half %mul, ptr addrspace(3) %c, align 2			store half %mul, ptr addrspace(3) %c, align 2
	%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
	store half %mul5, ptr addrspace(3) %arrayidx5, align 2			store half %mul5, ptr addrspace(3) %arrayidx5, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) {			define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) {
	; GCN-LABEL: @test1_as_3_0_0(			; GFX9-LABEL: @test1_as_3_0_0(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2			; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
	; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]			; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
	; GCN-NEXT: store <2 x half> [[TMP5]], ptr [[C:%.*]], align 2			; GFX9-NEXT: store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2
	; GCN-NEXT: ret void			; GFX9-NEXT: ret void
				;
				; VI-LABEL: @test1_as_3_0_0(
				; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2
				; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2
				; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]]
				; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1
				; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2
				; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1
				; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2
				; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]]
				; VI-NEXT: store half [[MUL]], ptr [[C:%.*]], align 2
				; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr [[C]], i64 1
				; VI-NEXT: store half [[MUL5]], ptr [[ARRAYIDX5]], align 2
				; VI-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%i1 = load half, ptr %b, align 2			%i1 = load half, ptr %b, align 2
	%mul = fmul half %i0, %i1			%mul = fmul half %i0, %i1
	%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
	%i3 = load half, ptr addrspace(3) %arrayidx3, align 2			%i3 = load half, ptr addrspace(3) %arrayidx3, align 2
	%arrayidx4 = getelementptr inbounds half, ptr %b, i64 1			%arrayidx4 = getelementptr inbounds half, ptr %b, i64 1
	%i4 = load half, ptr %arrayidx4, align 2			%i4 = load half, ptr %arrayidx4, align 2
	%mul5 = fmul half %i3, %i4			%mul5 = fmul half %i3, %i4
	store half %mul, ptr %c, align 2			store half %mul, ptr %c, align 2
	%arrayidx5 = getelementptr inbounds half, ptr %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr %c, i64 1
	store half %mul5, ptr %arrayidx5, align 2			store half %mul5, ptr %arrayidx5, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) {			define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) {
	; GCN-LABEL: @test1_as_0_0_3_v2f16(			; GFX9-LABEL: @test1_as_0_0_3_v2f16(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2			; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2
	; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2			; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
	; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]			; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
	; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2			; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: ret void			; GFX9-NEXT: ret void
				;
				; VI-LABEL: @test1_as_0_0_3_v2f16(
				; VI-NEXT: [[I0:%.*]] = load half, ptr [[A:%.*]], align 2
				; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2
				; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]]
				; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr [[A]], i64 1
				; VI-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX3]], align 2
				; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1
				; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2
				; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]]
				; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2
				; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1
				; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2
				; VI-NEXT: ret void
	;			;
	%i0 = load half, ptr %a, align 2			%i0 = load half, ptr %a, align 2
	%i1 = load half, ptr %b, align 2			%i1 = load half, ptr %b, align 2
	%mul = fmul half %i0, %i1			%mul = fmul half %i0, %i1
	%arrayidx3 = getelementptr inbounds half, ptr %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr %a, i64 1
	%i3 = load half, ptr %arrayidx3, align 2			%i3 = load half, ptr %arrayidx3, align 2
	%arrayidx4 = getelementptr inbounds half, ptr %b, i64 1			%arrayidx4 = getelementptr inbounds half, ptr %b, i64 1
	%i4 = load half, ptr %arrayidx4, align 2			%i4 = load half, ptr %arrayidx4, align 2
	%mul5 = fmul half %i3, %i4			%mul5 = fmul half %i3, %i4
	store half %mul, ptr addrspace(3) %c, align 2			store half %mul, ptr addrspace(3) %c, align 2
	%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
	store half %mul5, ptr addrspace(3) %arrayidx5, align 2			store half %mul5, ptr addrspace(3) %arrayidx5, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {			define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
	; GCN-LABEL: @test1_fma_v2f16(			; GCN-LABEL: @test1_fma_v2f16(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2			; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
	; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2			; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])			; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
	; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2			; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2
	; GCN-NEXT: ret void			; GCN-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%i1 = load half, ptr addrspace(3) %b, align 2			%i1 = load half, ptr addrspace(3) %b, align 2
	%i2 = load half, ptr addrspace(3) %c, align 2			%i2 = load half, ptr addrspace(3) %c, align 2
	%fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)			%fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
	%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
	%i3 = load half, ptr addrspace(3) %arrayidx3, align 2			%i3 = load half, ptr addrspace(3) %arrayidx3, align 2
	%arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1			%arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
	%i4 = load half, ptr addrspace(3) %arrayidx4, align 2			%i4 = load half, ptr addrspace(3) %arrayidx4, align 2
	%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
	%i5 = load half, ptr addrspace(3) %arrayidx5, align 2			%i5 = load half, ptr addrspace(3) %arrayidx5, align 2
	%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)			%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
	store half %fma0, ptr addrspace(3) %d, align 2			store half %fma0, ptr addrspace(3) %d, align 2
	%arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1			%arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
	store half %fma1, ptr addrspace(3) %arrayidx6, align 2			store half %fma1, ptr addrspace(3) %arrayidx6, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) {			define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) {
	; GCN-LABEL: @mul_scalar_v2f16(			; GFX9-LABEL: @mul_scalar_v2f16(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0			; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
	; GCN-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x half> [[TMP3]], <2 x half> poison, <2 x i32> zeroinitializer			; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer
	; GCN-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP2]], [[SHUFFLE]]			; GFX9-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]]
	; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2			; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: ret void			; GFX9-NEXT: ret void
				;
				; VI-LABEL: @mul_scalar_v2f16(
				; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2
				; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[SCALAR:%.*]]
				; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1
				; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2
				; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[SCALAR]]
				; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2
				; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1
				; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2
				; VI-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%mul = fmul half %i0, %scalar			%mul = fmul half %i0, %scalar
	%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
	%i3 = load half, ptr addrspace(3) %arrayidx3, align 2			%i3 = load half, ptr addrspace(3) %arrayidx3, align 2
	%mul5 = fmul half %i3, %scalar			%mul5 = fmul half %i3, %scalar
	store half %mul, ptr addrspace(3) %c, align 2			store half %mul, ptr addrspace(3) %c, align 2
	%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
	store half %mul5, ptr addrspace(3) %arrayidx5, align 2			store half %mul5, ptr addrspace(3) %arrayidx5, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {			define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
	; GCN-LABEL: @fabs_v2f16(			; GCN-LABEL: @fabs_v2f16(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])			; GCN-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
	; GCN-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2			; GCN-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: ret void			; GCN-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%fabs0 = call half @llvm.fabs.f16(half %i0)			%fabs0 = call half @llvm.fabs.f16(half %i0)
	%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
	%i3 = load half, ptr addrspace(3) %arrayidx3, align 2			%i3 = load half, ptr addrspace(3) %arrayidx3, align 2
	%fabs1 = call half @llvm.fabs.f16(half %i3)			%fabs1 = call half @llvm.fabs.f16(half %i3)
	store half %fabs0, ptr addrspace(3) %c, align 2			store half %fabs0, ptr addrspace(3) %c, align 2
	%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
	store half %fabs1, ptr addrspace(3) %arrayidx5, align 2			store half %fabs1, ptr addrspace(3) %arrayidx5, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {			define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
	; GCN-LABEL: @test1_fabs_fma_v2f16(			; GCN-LABEL: @test1_fabs_fma_v2f16(
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2			; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
	; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2			; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])			; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
	; GCN-NEXT: [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])			; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
	; GCN-NEXT: store <2 x half> [[TMP8]], ptr addrspace(3) [[D:%.*]], align 2			; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
	; GCN-NEXT: ret void			; GCN-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%i1 = load half, ptr addrspace(3) %b, align 2			%i1 = load half, ptr addrspace(3) %b, align 2
	%i2 = load half, ptr addrspace(3) %c, align 2			%i2 = load half, ptr addrspace(3) %c, align 2
	%i0.fabs = call half @llvm.fabs.f16(half %i0)			%i0.fabs = call half @llvm.fabs.f16(half %i0)

	%fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)			%fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
	Show All 13 Lines
	}			}

	define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {			define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
	; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(			; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
	; GCN-NEXT: [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2			; GCN-NEXT: [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2
	; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])			; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
	; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1			; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1
	; GCN-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2			; GCN-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2
	; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2			; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
	; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0			; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
	; GCN-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1			; GCN-NEXT: [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1
	; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]])			; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]])
	; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2			; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
	; GCN-NEXT: ret void			; GCN-NEXT: ret void
	;			;
	%i0 = load half, ptr addrspace(3) %a, align 2			%i0 = load half, ptr addrspace(3) %a, align 2
	%i1 = load half, ptr addrspace(3) %b, align 2			%i1 = load half, ptr addrspace(3) %b, align 2
	%i2 = load half, ptr addrspace(3) %c, align 2			%i2 = load half, ptr addrspace(3) %c, align 2
	%i1.fabs = call half @llvm.fabs.f16(half %i1)			%i1.fabs = call half @llvm.fabs.f16(half %i1)

	%fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)			%fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
	%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1			%arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
	%i3 = load half, ptr addrspace(3) %arrayidx3, align 2			%i3 = load half, ptr addrspace(3) %arrayidx3, align 2
	%arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1			%arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
	%i4 = load half, ptr addrspace(3) %arrayidx4, align 2			%i4 = load half, ptr addrspace(3) %arrayidx4, align 2
	%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1			%arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
	%i5 = load half, ptr addrspace(3) %arrayidx5, align 2			%i5 = load half, ptr addrspace(3) %arrayidx5, align 2
	%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)			%fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
	store half %fma0, ptr addrspace(3) %d, align 2			store half %fma0, ptr addrspace(3) %d, align 2
	%arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1			%arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
	store half %fma1, ptr addrspace(3) %arrayidx6, align 2			store half %fma1, ptr addrspace(3) %arrayidx6, align 2
	ret void			ret void
	}			}

	define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {			define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
	; GFX9-LABEL: @canonicalize_v2f16(			; GFX9-LABEL: @canonicalize_v2f16(
	; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2			; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
	; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]])			; GFX9-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]])
	; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2			; GFX9-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
	; GFX9-NEXT: ret void			; GFX9-NEXT: ret void
	;			;
	; VI-LABEL: @canonicalize_v2f16(			; VI-LABEL: @canonicalize_v2f16(
	; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2			; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2
	; VI-NEXT: [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])			; VI-NEXT: [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])
	; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1			; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1
	; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2			; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2
	; VI-NEXT: [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])			; VI-NEXT: [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])
	Show All 22 Lines

llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll

	Show First 20 Lines • Show All 202 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16			; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT]])			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT]])
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])			; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
	; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; CHECK-NEXT: ret <4 x float> [[VECINS_31]]			; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
				; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	; DEFAULT-LABEL: define <4 x float> @int_exp_4x			; DEFAULT-LABEL: define <4 x float> @int_exp_4x
	; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {			; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
	; DEFAULT-NEXT: entry:			; DEFAULT-NEXT: entry:
	; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16			; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
	; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT]])			; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT]])
	; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])			; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
	; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])			; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
	; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]			; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
				; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.exp.f32(float %vecext)			%1 = tail call fast float @llvm.exp.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.exp.f32(float %vecext.1)			%2 = tail call fast float @llvm.exp.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16			; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT]])			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT]])
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])			; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
	; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; CHECK-NEXT: ret <4 x float> [[VECINS_31]]			; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
				; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	; DEFAULT-LABEL: define <4 x float> @int_log_4x			; DEFAULT-LABEL: define <4 x float> @int_log_4x
	; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {			; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
	; DEFAULT-NEXT: entry:			; DEFAULT-NEXT: entry:
	; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16			; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
	; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT]])			; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT]])
	; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])			; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
	; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])			; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
	; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]			; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
				; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.log.f32(float %vecext)			%1 = tail call fast float @llvm.log.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.log.f32(float %vecext.1)			%2 = tail call fast float @llvm.log.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16			; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
	; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])			; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
	; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])			; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
	; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; CHECK-NEXT: ret <4 x float> [[VECINS_31]]			; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
				; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	; DEFAULT-LABEL: define <4 x float> @int_sin_4x			; DEFAULT-LABEL: define <4 x float> @int_sin_4x
	; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {			; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
	; DEFAULT-NEXT: entry:			; DEFAULT-NEXT: entry:
	; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16			; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
	; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0			; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
	; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])			; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
	; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0			; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
	; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1			; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
	; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])			; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
	; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1			; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
	; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>			; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
	; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])			; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
	; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>			; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
	; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>			; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
	; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]			; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
				; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
				; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
	;			;
	entry:			entry:
	%0 = load <4 x float>, ptr %a, align 16			%0 = load <4 x float>, ptr %a, align 16
	%vecext = extractelement <4 x float> %0, i32 0			%vecext = extractelement <4 x float> %0, i32 0
	%1 = tail call fast float @llvm.sin.f32(float %vecext)			%1 = tail call fast float @llvm.sin.f32(float %vecext)
	%vecins = insertelement <4 x float> undef, float %1, i32 0			%vecins = insertelement <4 x float> undef, float %1, i32 0
	%vecext.1 = extractelement <4 x float> %0, i32 1			%vecext.1 = extractelement <4 x float> %0, i32 1
	%2 = tail call fast float @llvm.sin.f32(float %vecext.1)			%2 = tail call fast float @llvm.sin.f32(float %vecext.1)
	▲ Show 20 Lines • Show All 351 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S -slp-threshold=-10000 | FileCheck %s			; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S -slp-threshold=-10000 | FileCheck %s

	define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {			define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
	; CHECK-LABEL: @sdiv_v8i32_undefs(			; CHECK-LABEL: @sdiv_v8i32_undefs(
	; CHECK-NEXT: ret <8 x i32> poison			; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
				; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
				; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4
				; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
				; CHECK-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
				; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4
				; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
				; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
				; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
				; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
				; CHECK-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
				; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
				; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
				; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 8, i32 9>
				; CHECK-NEXT: ret <8 x i32> [[R71]]
	;			;
	%a0 = extractelement <8 x i32> %a, i32 0			%a0 = extractelement <8 x i32> %a, i32 0
	%a1 = extractelement <8 x i32> %a, i32 1			%a1 = extractelement <8 x i32> %a, i32 1
	%a2 = extractelement <8 x i32> %a, i32 2			%a2 = extractelement <8 x i32> %a, i32 2
	%a3 = extractelement <8 x i32> %a, i32 3			%a3 = extractelement <8 x i32> %a, i32 3
	%a4 = extractelement <8 x i32> %a, i32 4			%a4 = extractelement <8 x i32> %a, i32 4
	%a5 = extractelement <8 x i32> %a, i32 5			%a5 = extractelement <8 x i32> %a, i32 5
	%a6 = extractelement <8 x i32> %a, i32 6			%a6 = extractelement <8 x i32> %a, i32 6
	Show All 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -passes=slp-vectorizer -slp-threshold=-999 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s			; RUN: opt -passes=slp-vectorizer -slp-threshold=-999 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s

	declare i64 @may_inf_loop_ro() nounwind readonly			declare i64 @may_inf_loop_ro() nounwind readonly
	declare i64 @may_inf_loop_rw() nounwind			declare i64 @may_inf_loop_rw() nounwind
	declare i64 @may_throw() willreturn			declare i64 @may_throw() willreturn

	; Base case with no interesting control dependencies			; Base case with no interesting control dependencies
	define void @test_no_control(ptr %a, ptr %b, ptr %c) {			define void @test_no_control(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test_no_control(			; CHECK-LABEL: @test_no_control(
	; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]			; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
	; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%ca2 = getelementptr i64, ptr %c, i32 1			%ca2 = getelementptr i64, ptr %c, i32 1
	%c2 = load i64, ptr %ca2			%c2 = load i64, ptr %ca2
	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	define void @test1(ptr %a, ptr %b, ptr %c) {			define void @test1(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test1(			; CHECK-LABEL: @test1(
	; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]			; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%c2 = call i64 @may_inf_loop_ro()			%c2 = call i64 @may_inf_loop_ro()
	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	define void @test2(ptr %a, ptr %b, ptr %c) {			define void @test2(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test2(			; CHECK-LABEL: @test2(
	; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]			; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%c2 = call i64 @may_inf_loop_ro()			%c2 = call i64 @may_inf_loop_ro()

	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	define void @test3(ptr %a, ptr %b, ptr %c) {			define void @test3(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test3(			; CHECK-LABEL: @test3(
	; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]			; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1

	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2
	%c2 = call i64 @may_inf_loop_ro()			%c2 = call i64 @may_inf_loop_ro()
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	define void @test4(ptr %a, ptr %b, ptr %c) {			define void @test4(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test4(			; CHECK-LABEL: @test4(
	; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]			; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1

	%c2 = call i64 @may_inf_loop_ro()			%c2 = call i64 @may_inf_loop_ro()
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	define void @test5(ptr %a, ptr %b, ptr %c) {			define void @test5(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test5(			; CHECK-LABEL: @test5(
	; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]			; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2
	%c2 = call i64 @may_inf_loop_ro()			%c2 = call i64 @may_inf_loop_ro()
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	define void @test6(ptr %a, ptr %b, ptr %c) {			define void @test6(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test6(			; CHECK-LABEL: @test6(
	; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]]			; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
	; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	call i64 @may_inf_loop_ro()			call i64 @may_inf_loop_ro()
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	Show All 16 Lines
	; previously exist.			; previously exist.
	define void @test7(ptr %a, ptr %b, ptr %c) {			define void @test7(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test7(			; CHECK-LABEL: @test7(
	; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1			; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1
	; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4			; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4
	; CHECK-NEXT: store i64 0, ptr [[A]], align 4			; CHECK-NEXT: store i64 0, ptr [[A]], align 4
	; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4			; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1			; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]			; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
	; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	store i64 0, ptr %a			store i64 0, ptr %a
	call i64 @may_inf_loop_ro()			call i64 @may_inf_loop_ro()
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	Show All 12 Lines
	; Same as test7, but with a throwing call			; Same as test7, but with a throwing call
	define void @test8(ptr %a, ptr %b, ptr %c) {			define void @test8(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test8(			; CHECK-LABEL: @test8(
	; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1			; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1
	; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4			; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4
	; CHECK-NEXT: store i64 0, ptr [[A]], align 4			; CHECK-NEXT: store i64 0, ptr [[A]], align 4
	; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]]			; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]]
	; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4			; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1			; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]			; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
	; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	store i64 0, ptr %a			store i64 0, ptr %a
	call i64 @may_throw() readonly			call i64 @may_throw() readonly
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	Show All 12 Lines
	; Same as test8, but with a readwrite maythrow call			; Same as test8, but with a readwrite maythrow call
	define void @test9(ptr %a, ptr %b, ptr %c) {			define void @test9(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test9(			; CHECK-LABEL: @test9(
	; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1			; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1
	; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4			; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A]], align 4
	; CHECK-NEXT: store i64 0, ptr [[A]], align 4			; CHECK-NEXT: store i64 0, ptr [[A]], align 4
	; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw()			; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw()
	; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4			; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4
	; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1			; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]			; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
	; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	store i64 0, ptr %a			store i64 0, ptr %a
	call i64 @may_throw()			call i64 @may_throw()
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	%ca2 = getelementptr i64, ptr %c, i32 1			%ca2 = getelementptr i64, ptr %c, i32 1
	%c2 = load i64, ptr %ca2			%c2 = load i64, ptr %ca2
	%add1 = add i64 %v1, %c1			%add1 = add i64 %v1, %c1
	%add2 = add i64 %v2, %c2			%add2 = add i64 %v2, %c2

	store i64 %add1, ptr %b			store i64 %add1, ptr %b
	%b2 = getelementptr i64, ptr %b, i32 1			%b2 = getelementptr i64, ptr %b, i32 1
	store i64 %add2, ptr %b2			store i64 %add2, ptr %b2
	ret void			ret void
	}			}

	; A variant of test7 which shows the same problem with a non-load instruction			; A variant of test7 which shows the same problem with a non-load instruction
	define void @test10(ptr %a, ptr %b, ptr %c) {			define void @test10(ptr %a, ptr %b, ptr %c) {
	; CHECK-LABEL: @test10(			; CHECK-LABEL: @test10(
	; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4			; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
	; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1			; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
	; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4			; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[TMP2]]
	; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[V1]]
	; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4			; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4
	; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[TMP3:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]]			; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
	; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[TMP4]]
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0			; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1			; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
	; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]			; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[U2]], i32 1
	; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4			; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP5]]
				; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%v1 = load i64, ptr %a			%v1 = load i64, ptr %a
	%a2 = getelementptr i64, ptr %a, i32 1			%a2 = getelementptr i64, ptr %a, i32 1
	%v2 = load i64, ptr %a2			%v2 = load i64, ptr %a2

	%u1 = udiv i64 200, %v1			%u1 = udiv i64 200, %v1
	store i64 %u1, ptr %a			store i64 %u1, ptr %a
	Show All 15 Lines
	; Variant of test10 block invariant operands to the udivs			; Variant of test10 block invariant operands to the udivs
	; FIXME: This is wrong, we're hoisting a faulting udiv above an infinite loop.			; FIXME: This is wrong, we're hoisting a faulting udiv above an infinite loop.
	define void @test11(i64 %x, i64 %y, ptr %b, ptr %c) {			define void @test11(i64 %x, i64 %y, ptr %b, ptr %c) {
	; CHECK-LABEL: @test11(			; CHECK-LABEL: @test11(
	; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[X:%.*]]			; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[X:%.*]]
	; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4			; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4
	; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()			; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
	; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]]			; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]]
	; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1			; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]			; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
	; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B]], align 4			; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%u1 = udiv i64 200, %x			%u1 = udiv i64 200, %x
	store i64 %u1, ptr %b			store i64 %u1, ptr %b
	call i64 @may_inf_loop_ro()			call i64 @may_inf_loop_ro()
	%u2 = udiv i64 200, %y			%u2 = udiv i64 200, %y

	%c1 = load i64, ptr %c			%c1 = load i64, ptr %c
	Show All 10 Lines

llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s			; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s

	define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {			define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
	; CHECK-LABEL: @test(			; CHECK-LABEL: @test(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0			; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0
	; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1			; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
	; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
	; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]			; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
	; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]			; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
	; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]			; CHECK-NEXT: [[D0:%.*]] = sdiv i64 [[P0]], [[P0]]
	; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]			; CHECK-NEXT: [[D1:%.*]] = sdiv i64 [[P1]], [[P1]]
	; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]			; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]]
	; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>			; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]]
	; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>			; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[D0]], i32 0
	; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>			; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[D1]], i32 1
	; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>			; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[D2]], i32 2
	; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]			; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[D3]], i32 3
	; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>			; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i64> [[TMP5]], [[TMP9]]
				; CHECK-NEXT: [[TMP11:%.*]] = shl <4 x i64> [[TMP4]], [[TMP10]]
				; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
				; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[D0]], i32 2
				; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
				; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[D1]], i32 2
				; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i64> [[TMP13]], [[TMP15]]
				; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
	; CHECK-NEXT: br label [[BB:%.*]]			; CHECK-NEXT: br label [[BB:%.*]]
	; CHECK: bb:			; CHECK: bb:
	; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]			; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP19:%.*]], [[BB]] ], [ [[TMP17]], [[ENTRY:%.*]] ]
	; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>			; CHECK-NEXT: [[TMP19]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
	; CHECK-NEXT: br label [[BB]]			; CHECK-NEXT: br label [[BB]]
	;			;
	entry:			entry:
	%a0 = add i64 %p0, %p0			%a0 = add i64 %p0, %p0
	%a1 = add i64 %p1, %p1			%a1 = add i64 %p1, %p1
	%a2 = add i64 %p2, %p2			%a2 = add i64 %p2, %p2
	%a3 = add i64 %p3, %p3			%a3 = add i64 %p3, %p3
	%m0 = mul i64 %p0, %p0			%m0 = mul i64 %p0, %p0
	Show All 36 Lines

llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=slp-vectorizer -S | FileCheck %s			; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=slp-vectorizer -S | FileCheck %s
	; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=inject-tli-mappings,slp-vectorizer -vector-library=SVML -S | FileCheck %s --check-prefix=VECLIB			; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=inject-tli-mappings,slp-vectorizer -vector-library=SVML -S | FileCheck %s --check-prefix=VECLIB
				RKSimonUnsubmitted Not Done Reply Inline Actions do we have a fveclib that we can add test coverage for that will vectorize sin calls? RKSimon: do we have a fveclib that we can add test coverage for that will vectorize sin calls?
				ABataevAuthorUnsubmitted Done Reply Inline Actions There is AArch64/accelerate-vector-functions-inseltpoison.ll for AArch64, will add SVML for x86 ABataev: There is AArch64/accelerate-vector-functions-inseltpoison.ll for AArch64, will add SVML for x86

	@src = common global [8 x double] zeroinitializer, align 64			@src = common global [8 x double] zeroinitializer, align 64
	@dst = common global [8 x double] zeroinitializer, align 64			@dst = common global [8 x double] zeroinitializer, align 64

	declare double @llvm.sqrt.f64(double)			declare double @llvm.sqrt.f64(double)
	declare double @llvm.sin.f64(double)			declare double @llvm.sin.f64(double)

	define void @test() {			define void @test() {
	; CHECK-LABEL: @test(			; CHECK-LABEL: @test(
	; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8			; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8
	; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8			; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
	; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8			; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
	; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8			; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
	; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8			; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
	; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8			; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
	; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8			; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
	; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8			; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
	; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0			; CHECK-NEXT: [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]])
	; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1			; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]])
	; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])			; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0			; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
	; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1			; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
	; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]])			; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A4]], i32 1
	; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0			; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
	; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1			; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
	; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])			; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A5]], i32 1
	; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0			; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]])
	; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1			; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0
	; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])			; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[SIN3]], i32 1
	; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]			; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
	; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]			; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[SIN0]], i32 0
	; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]			; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[SIN2]], i32 1
	; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8			; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP6]]
				; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP12]]
				; CHECK-NEXT: store <2 x double> [[TMP13]], ptr @dst, align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	; VECLIB-LABEL: @test(			; VECLIB-LABEL: @test(
	; VECLIB-NEXT: [[A0:%.*]] = load double, ptr @src, align 8			; VECLIB-NEXT: [[A0:%.*]] = load double, ptr @src, align 8
	; VECLIB-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8			; VECLIB-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
	; VECLIB-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8			; VECLIB-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
	; VECLIB-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8			; VECLIB-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
	; VECLIB-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8			; VECLIB-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
	▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Introduce isLegalVectorOp to check if the vector instruction is going to be scalarized.
Needs ReviewPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 556471

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll

llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll

llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll

llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll

llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Introduce isLegalVectorOp to check if the vector instruction is going to be scalarized.Needs ReviewPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 556471

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll

llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll

llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll

llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll

llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll

llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll

[SLP]Introduce isLegalVectorOp to check if the vector instruction is going to be scalarized.
Needs ReviewPublic