If the vector instruction that would be generated is going to be scalarized during lowering, there is no need to try to vectorize it. Instead, it is better to generate a buildvector/gather node and try to vectorize the operands independently. This should improve vectorization quality/performance.
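A minimal sketch of the legality check described above, assuming LLVM's TargetLowering interface; the helper name `willBeScalarized` is hypothetical, while `InstructionOpcodeToISD`, `getValueType`, and `isOperationLegalOrCustom` are existing TargetLoweringBase methods:

```cpp
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Hypothetical helper: returns true if the target would expand (scalarize)
// the vector form of \p Opcode on \p VecTy, in which case the SLP tree
// should get a gather/buildvector node instead of a vectorized node.
static bool willBeScalarized(const TargetLowering &TLI, unsigned Opcode,
                             FixedVectorType *VecTy, const DataLayout &DL) {
  int ISDOpcode = TLI.InstructionOpcodeToISD(Opcode);
  if (!ISDOpcode)
    return false; // No ISD mapping; be conservative and allow vectorization.
  EVT VT = TLI.getValueType(DL, VecTy, /*AllowUnknown=*/true);
  if (!VT.isSimple())
    return false;
  // Legal and custom-lowered operations produce real vector code; anything
  // marked Expand is scalarized by the legalizer.
  return !TLI.isOperationLegalOrCustom(ISDOpcode, VT);
}
```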
Event Timeline
llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll:5
This FIXME was fixed but has now been unfixed.
llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll:5
Hmm, I checked the VI tests and it looks like the fmul <2 x half> tests are scalarized in the end (https://godbolt.org/z/hPcjoETWT), so it is not worth vectorizing them.
I don't understand why this patch is in terms of *legality*. The lowering choices should already be reflected in the *cost* returned for these operations. Why do we need to directly access the lowering choices rather than using the existing cost based APIs?
In particular, if an intermediate sub-tree is illegal, but the cost of lowering is low enough, then vectorizing the entire tree (if the total cost accounting shows that being profitable) is the right answer. You're spending a huge amount of additional work to reclaim that ability by hard failing the vectorization.
It's not really clear to me what your motivating case here is, but I suspect you've got an inaccurate cost model for one of the illegal operations. I'd strongly prefer fixing that rather than taking this approach.
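For reference, the existing cost-based API being advocated here looks roughly like this; the wrapper function and cost-kind choice are illustrative assumptions, while `getArithmeticInstrCost` is the real TargetTransformInfo entry point:

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Existing cost-based API: ask TTI what a vector fmul would cost. An illegal
// operation is not rejected outright; it simply comes back with a higher
// (e.g. scalarization) cost that the tree-level accounting can weigh.
static InstructionCost getVectorFMulCost(const TargetTransformInfo &TTI,
                                         FixedVectorType *VecTy) {
  return TTI.getArithmeticInstrCost(Instruction::FMul, VecTy,
                                    TargetTransformInfo::TCK_RecipThroughput);
}
```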
Cost estimation happens too late and affects the whole vectorization result. This patch prevents building nodes that should not be tried for vectorization at all, since they are later going to be scalarized. The cost model does not help here at all.
> In particular, if an intermediate sub-tree is illegal, but the cost of lowering is low enough, then vectorizing the entire tree (if the total cost accounting shows that being profitable) is the right answer. You're spending a huge amount of additional work to reclaim that ability by hard failing the vectorization.
No. It prevents building vector nodes that will be scalarized and that should be gather/buildvector nodes instead. If the operands can be vectorized, we need to vectorize them on their own. Otherwise we may actually miss vectorization of some trees because of the too-high cost of those scalarized nodes.
> It's not really clear to me what your motivating case here is, but I suspect you've got an inaccurate cost model for one of the illegal operations. I'd strongly prefer fixing that rather than taking this approach.
Again, no: it is not about cost, it is a legality check.
llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll:2
Do we have a fveclib that we can add test coverage for that will vectorize sin calls?
llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll:2
There is AArch64/accelerate-vector-functions-inseltpoison.ll for AArch64; I will add SVML coverage for x86.
I wonder if we'd be better off just adding an isScalarized flag (and maybe a VF value?) as a member variable inside InstructionCost to track when the cost was scalarized. What do you think?
Yes, I'm happy to investigate this - it crosses over with another issue we're starting to hit: we need better methods to compare VF * scalar_cost vs vector_cost for cases where the throughput costs are below 1.0 (but clamped to 1).
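A rough sketch of how such a flag might look; this is a hypothetical, simplified mock-up, not the real class from llvm/Support/InstructionCost.h, and the Scalarized/ScalarVF members are invented for illustration:

```cpp
#include <cstdint>
#include <optional>

// Hypothetical extension of llvm::InstructionCost, reduced to the fields
// relevant here. Only Scalarized and ScalarVF are new, made-up members that
// would let callers compare VF * scalar_cost against a true vector_cost.
class InstructionCost {
  using CostType = int64_t;
  enum CostState { Valid, Invalid };

  CostType Value = 0;
  CostState State = Valid;
  bool Scalarized = false;          // cost came from scalarization
  std::optional<unsigned> ScalarVF; // VF used when scalarizing

public:
  void setScalarized(unsigned VF) {
    Scalarized = true;
    ScalarVF = VF;
  }
  bool isScalarized() const { return Scalarized; }
  std::optional<unsigned> getScalarizationVF() const { return ScalarVF; }
};
```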
How dependent are your other patches on getting this patch in?
There is no dependency; this was just a finding after the discussion with Philip Reames in another patch.
getTypeBasedIntrinsicInstrCost already has a similar IntrinsicID->ISD conversion - merge them?
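For context, a condensed sketch of the kind of IntrinsicID->ISD mapping being referred to; the helper name is hypothetical, and the real switch in getTypeBasedIntrinsicInstrCost covers far more intrinsics:

```cpp
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Condensed IntrinsicID -> ISD mapping of the kind duplicated between the
// cost model and this patch; returning 0 means "no direct ISD equivalent".
static int getISDForIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::sqrt:   return ISD::FSQRT;
  case Intrinsic::sin:    return ISD::FSIN;
  case Intrinsic::cos:    return ISD::FCOS;
  case Intrinsic::fma:    return ISD::FMA;
  case Intrinsic::minnum: return ISD::FMINNUM;
  case Intrinsic::maxnum: return ISD::FMAXNUM;
  default:
    return 0;
  }
}
```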