Diff 454772

llvm/include/llvm/Analysis/VectorUtils.h

	Show First 20 Lines • Show All 119 Lines • ▼ Show 20 Lines
	};			};

	/// Holds the VFShape for a specific scalar to vector function mapping.			/// Holds the VFShape for a specific scalar to vector function mapping.
	struct VFInfo {			struct VFInfo {
	VFShape Shape; /// Classification of the vector function.			VFShape Shape; /// Classification of the vector function.
	std::string ScalarName; /// Scalar Function Name.			std::string ScalarName; /// Scalar Function Name.
	std::string VectorName; /// Vector Function Name associated to this VFInfo.			std::string VectorName; /// Vector Function Name associated to this VFInfo.
	VFISAKind ISA; /// Instruction Set Architecture.			VFISAKind ISA; /// Instruction Set Architecture.

				unsigned getParamIndexForMask() const {
				david-armUnsubmitted Done Reply Inline Actions Might be worth adding `///` comments here, since the others all have them? david-arm: Might be worth adding `///` comments here, since the others all have them?
				auto MaskPos = getParamIndexForOptionalMask();
				if (MaskPos)
				return *MaskPos;

				llvm_unreachable("Requested paramater index of non-existent mask!");
				david-armUnsubmitted Done Reply Inline Actions I think this will be compiled away in a release build, right? So really it's just a non-release wrapper around `getParamIndexForOptionalMask`. Given it's only called in one place is it worth just making `getParamIndexForOptionalMask` public instead and putting an assert in LoopVectorize.cpp that a mask exists? david-arm: I think this will be compiled away in a release build, right? So really it's just a non-release…
				}

				bool isMasked() const { return getParamIndexForOptionalMask().has_value(); }
				david-armUnsubmitted Not Done Reply Inline Actions This function is never called - can it be deleted? david-arm: This function is never called - can it be deleted?
				huntergrAuthorUnsubmitted Done Reply Inline Actions There's now a use for it in the assert when building a recipe. (This was used in the original patch, but was left in when splitting into 3 parts). huntergr: There's now a use for it in the assert when building a recipe. (This was used in the original…

				private:
				Optional<unsigned> getParamIndexForOptionalMask() const {
				unsigned ParamCount = Shape.Parameters.size();
				for (unsigned i = 0; i < ParamCount; ++i)
				if (Shape.Parameters[i].ParamKind == VFParamKind::GlobalPredicate)
				return i;

				return None;
				}
	};			};

	namespace VFABI {			namespace VFABI {
	/// LLVM Internal VFABI ISA token for vector functions.			/// LLVM Internal VFABI ISA token for vector functions.
	static constexpr char const *_LLVM_ = "_LLVM_";			static constexpr char const *_LLVM_ = "_LLVM_";
	/// Prefix for internal name redirection for vector function that			/// Prefix for internal name redirection for vector function that
	/// tells the compiler to scalarize the call using the scalar name			/// tells the compiler to scalarize the call using the scalar name
	/// of the function. For example, a mangled name like			/// of the function. For example, a mangled name like
	▲ Show 20 Lines • Show All 837 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Show First 20 Lines • Show All 1,101 Lines • ▼ Show 20 Lines	for (Instruction &I : *BB) {
}		}

// Do not let llvm.experimental.noalias.scope.decl block the vectorization.		// Do not let llvm.experimental.noalias.scope.decl block the vectorization.
// TODO: there might be cases that it should block the vectorization. Let's		// TODO: there might be cases that it should block the vectorization. Let's
// ignore those for now.		// ignore those for now.
if (isa<NoAliasScopeDeclInst>(&I))		if (isa<NoAliasScopeDeclInst>(&I))
continue;		continue;

		if (CallInst *CI = dyn_cast<CallInst>(&I)) {
		// Check whether we have at least one masked vector version of a scalar
		// function.
		bool HasMaskedVersion = false;

		auto Mappings = VFDatabase::getMappings(*CI);
		for (VFInfo Info : Mappings)
		HasMaskedVersion \|= Info.isMasked();

		if (HasMaskedVersion) {
		MaskedOp.insert(CI);
		continue;
		}
		}

// Loads are handled via masking (or speculated if safe to do so.)		// Loads are handled via masking (or speculated if safe to do so.)
if (auto *LI = dyn_cast<LoadInst>(&I)) {		if (auto *LI = dyn_cast<LoadInst>(&I)) {
if (!SafePtrs.count(LI->getPointerOperand()))		if (!SafePtrs.count(LI->getPointerOperand()))
MaskedOp.insert(LI);		MaskedOp.insert(LI);
continue;		continue;
}		}

// Predicated store requires some form of masking:		// Predicated store requires some form of masking:
▲ Show 20 Lines • Show All 301 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 469 Lines • ▼ Show 20 Lines	public:
/// loop and the start value for the canonical induction, if it is != 0. The		/// loop and the start value for the canonical induction, if it is != 0. The
/// latter is the case when vectorizing the epilogue loop. In the case of		/// latter is the case when vectorizing the epilogue loop. In the case of
/// epilogue vectorization, this function is overridden to handle the more		/// epilogue vectorization, this function is overridden to handle the more
/// complex control flow around the loops.		/// complex control flow around the loops.
virtual std::pair<BasicBlock , Value > createVectorizedLoopSkeleton();		virtual std::pair<BasicBlock , Value > createVectorizedLoopSkeleton();

/// Widen a single call instruction within the innermost loop.		/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,		void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);		VPTransformState &State, bool MaskAvailable);

/// Fix the vectorized code, taking care of header phi's, live-outs, and more.		/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);		void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

// Return true if any runtime check is added.		// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }		bool areSafetyChecksAdded() { return AddedSafetyChecks; }

/// A type for vectorized values in the new loop. Each value from the		/// A type for vectorized values in the new loop. Each value from the
▲ Show 20 Lines • Show All 1,041 Lines • ▼ Show 20 Lines	public:
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;		InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

/// Estimate cost of a call instruction CI if it were vectorized with factor		/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead		/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be		/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -		/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.		/// i.e. either vector version isn't available, or is too expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,		InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
		bool NeedsMask,
bool &NeedToScalarize) const;		bool &NeedToScalarize) const;

/// Returns true if the per-lane cost of VectorizationFactor A is lower than		/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.		/// that of B.
bool isMoreProfitable(const VectorizationFactor &A,		bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B) const;		const VectorizationFactor &B) const;

/// Invalidates decisions already taken by the cost model.		/// Invalidates decisions already taken by the cost model.
▲ Show 20 Lines • Show All 1,852 Lines • ▼ Show 20 Lines	for (Instruction &In : llvm::make_early_inc_range(*BB)) {
}		}

CSEMap[&In] = &In;		CSEMap[&In] = &In;
}		}
}		}

InstructionCost		InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,		LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
		bool NeedsMask,
bool &NeedToScalarize) const {		bool &NeedToScalarize) const {
Function *F = CI->getCalledFunction();		Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();		Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;		SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->args())		for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());		ScalarTys.push_back(ArgOp->getType());

// Estimate cost of scalarized vector call. The source operands are assumed		// Estimate cost of scalarized vector call. The source operands are assumed
Show All 15 Lines	LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);		InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

InstructionCost Cost =		InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;		ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

// If we can't emit a vector call for this function, then the currently found		// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.		// cost is the cost we need to return.
NeedToScalarize = true;		NeedToScalarize = true;
VFShape Shape = VFShape::get(CI, VF, false /HasGlobalPred*/);		VFShape Shape = VFShape::get(*CI, VF, NeedsMask);
Function VecFunc = VFDatabase(CI).getVectorizedFunction(Shape);		Function VecFunc = VFDatabase(CI).getVectorizedFunction(Shape);
		// If we want an unmasked vector function but can't find one matching the VF,
		// and the target supports an active lane mask, maybe we can find a vector
		// function that does use a mask and synthesize an all-true mask.
		if (!VecFunc && !NeedsMask &&
		david-armUnsubmitted Not Done Reply Inline Actions This looks a little strange to me. In my mind, the ability to emit an active lane mask based on two integer inputs is orthogonal to how cheap it is to broadcast a true bit across a predicate. For example, an architecture may cheaply support the latter, but not the former. Maybe X86 is such an example? Can we not just let the mask cost decide the behaviour? That way you can simplify this to just if (!VecFunc) { ... david-arm: This looks a little strange to me. In my mind, the ability to emit an active lane mask based on…
		huntergrAuthorUnsubmitted Done Reply Inline Actions My thinking was to treat the capability to emit an active lane mask as a proxy for being able to use masks at all, but perhaps that's a little too conservative. I don't know if we should add a proper TTI interface to represent that capability, or just rely on the VFDatabase only having entries which the target is capable of supporting. In any case, I've removed that check for now. huntergr: My thinking was to treat the capability to emit an active lane mask as a proxy for being able…
		TTI.emitGetActiveLaneMask() != PredicationStyle::None) {
		Shape = VFShape::get(CI, VF, /HasGlobalPred=*/true);
		VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
		}

if (!TLI \|\| CI->isNoBuiltin() \|\| !VecFunc)		if (!TLI \|\| CI->isNoBuiltin() \|\| !VecFunc)
return Cost;		return Cost;

// If the corresponding vector cost is cheaper, return its cost.		// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =		InstructionCost VectorCallCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);		TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {		if (VectorCallCost < Cost) {
NeedToScalarize = false;		NeedToScalarize = false;
Cost = VectorCallCost;		Cost = VectorCallCost;
		david-armUnsubmitted Done Reply Inline Actions Do we really need both the `Variant` and the `NeedToScalarize` parameter? It looks naively that setting `Variant = VecFunc` is synonymous with `NeedToScalarize = false`. I haven't looked into this in detail so I could be wrong, but it might make more sense to remove the `NeedToScalarize` in favour of setting `Variant`? david-arm: Do we really need both the `Variant` and the `NeedToScalarize` parameter? It looks naively that…
}		}
return Cost;		return Cost;
}		}

static Type MaybeVectorizeType(Type Elt, ElementCount VF) {		static Type MaybeVectorizeType(Type Elt, ElementCount VF) {
if (VF.isScalar() \|\| (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))		if (VF.isScalar() \|\| (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
return Elt;		return Elt;
return VectorType::get(Elt, VF);		return VectorType::get(Elt, VF);
▲ Show 20 Lines • Show All 695 Lines • ▼ Show 20 Lines

bool InnerLoopVectorizer::useOrderedReductions(		bool InnerLoopVectorizer::useOrderedReductions(
const RecurrenceDescriptor &RdxDesc) {		const RecurrenceDescriptor &RdxDesc) {
return Cost->useOrderedReductions(RdxDesc);		return Cost->useOrderedReductions(RdxDesc);
}		}

void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,		void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
VPUser &ArgOperands,		VPUser &ArgOperands,
VPTransformState &State) {		VPTransformState &State,
		bool MaskAvailable) {
assert(!isa<DbgInfoIntrinsic>(CI) &&		assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");		"DbgInfoIntrinsic should have been dropped during VPlan construction");
State.setDebugLocFromInst(&CI);		State.setDebugLocFromInst(&CI);

SmallVector<Type *, 4> Tys;
for (Value *ArgOperand : CI.args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));

Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);		Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);

// The flag shows whether we use Intrinsic or a usual Call for vectorized		// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.		// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?		// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;		bool NeedToScalarize = false;
InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);		InstructionCost CallCost =
		Cost->getVectorCallCost(&CI, VF, MaskAvailable, NeedToScalarize);
InstructionCost IntrinsicCost =		InstructionCost IntrinsicCost =
ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;		ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;		bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic \|\| !NeedToScalarize) &&		assert((UseVectorIntrinsic \|\| !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");		"Instruction should be scalarized elsewhere.");
assert((IntrinsicCost.isValid() \|\| CallCost.isValid()) &&		assert((IntrinsicCost.isValid() \|\| CallCost.isValid()) &&
"Either the intrinsic cost or vector call cost must be valid");		"Either the intrinsic cost or vector call cost must be valid");

		// If we added a mask operand in the recipe, extract it so that we can
		// insert it in the right position for the vectorized call. The mask isn't
		// guaranteed to be the last argument.
		VPValue *VPMask = nullptr;
		if (MaskAvailable)
		VPMask = ArgOperands.removeAndReturnLastOperand();

for (unsigned Part = 0; Part < UF; ++Part) {		for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Type *, 2> TysForDecl = {CI.getType()};		SmallVector<Type *, 2> TysForDecl = {CI.getType()};
SmallVector<Value *, 4> Args;		SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(ArgOperands.operands())) {		for (const auto &I : enumerate(ArgOperands.operands())) {
// Some intrinsics have a scalar argument - don't replace it with a		// Some intrinsics have a scalar argument - don't replace it with a
// vector.		// vector.
Value *Arg;		Value *Arg;
if (!UseVectorIntrinsic \|\|		if (!UseVectorIntrinsic \|\|
!isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))		!isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
Arg = State.get(I.value(), Part);		Arg = State.get(I.value(), Part);
else		else
Arg = State.get(I.value(), VPIteration(0, 0));		Arg = State.get(I.value(), VPIteration(0, 0));
if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))		if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
TysForDecl.push_back(Arg->getType());		TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);		Args.push_back(Arg);
}		}

Function *VectorF;		Function *VectorF;
		bool VectorFTakesMask = false;
		unsigned VectorFMaskPos = 0;

if (UseVectorIntrinsic) {		if (UseVectorIntrinsic) {
// Use vector version of the intrinsic.		// Use vector version of the intrinsic.
if (VF.isVector())		if (VF.isVector())
TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);		TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
Module *M = State.Builder.GetInsertBlock()->getModule();		Module *M = State.Builder.GetInsertBlock()->getModule();
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);		VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");		assert(VectorF && "Can't retrieve vector intrinsic.");
} else {		} else {
// Use vector version of the function call.		// Use vector version of the function call.
const VFShape Shape = VFShape::get(CI, VF, false /HasGlobalPred/);		VFShape Shape = VFShape::get(CI, VF, MaskAvailable);

		VectorF = VFDatabase(CI).getVectorizedFunction(Shape);

		if (!VectorF && !MaskAvailable &&
		TTI->emitGetActiveLaneMask() != PredicationStyle::None) {
		Shape = VFShape::get(CI, VF, /HasGlobalPred=/true);
		VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
		}
#ifndef NDEBUG		#ifndef NDEBUG
assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&		assert(VectorF != nullptr && "Can't create vector function.");
"Can't create vector function.");
#endif		#endif
VectorF = VFDatabase(CI).getVectorizedFunction(Shape);		// Check the VFInfo for masking details
		for (VFInfo Info : VFDatabase(CI).getMappings(CI)) {
		if (Info.Shape == Shape) {
		VectorFTakesMask = Info.isMasked();
		if (VectorFTakesMask)
		VectorFMaskPos = Info.getParamIndexForMask();
		break;
		}
		}
		}

		assert((!MaskAvailable \|\| VectorFTakesMask) &&
		"Mask supplied for function with no mask argument");

		if (VectorFTakesMask) {
		Value *Mask = nullptr;
		if (VPMask)
		Mask = State.get(VPMask, Part);
		else
		Mask = ConstantInt::getTrue(VectorType::get(
		IntegerType::getInt1Ty(VectorF->getFunctionType()->getContext()),
		VF));
		Args.insert(Args.begin() + VectorFMaskPos, Mask);
}		}

SmallVector<OperandBundleDef, 1> OpBundles;		SmallVector<OperandBundleDef, 1> OpBundles;
CI.getOperandBundlesAsDefs(OpBundles);		CI.getOperandBundlesAsDefs(OpBundles);
CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);		CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

if (isa<FPMathOperator>(V))		if (isa<FPMathOperator>(V))
V->copyFastMathFlags(&CI);		V->copyFastMathFlags(&CI);

State.set(Def, V, Part);		State.set(Def, V, Part);
State.addMetadata(V, &CI);		State.addMetadata(V, &CI);
}		}
}		}

void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {		void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this		// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does		// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.		// this check. Collecting Scalars for VF=1 does not make any sense.
assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&		assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
▲ Show 20 Lines • Show All 3,046 Lines • ▼ Show 20 Lines	case Instruction::FPTrunc: {
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);		return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}		}
case Instruction::Call: {		case Instruction::Call: {
if (RecurrenceDescriptor::isFMulAddIntrinsic(I))		if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))		if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
return *RedCost;		return *RedCost;
bool NeedToScalarize;		bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);		CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);		InstructionCost CallCost =
		getVectorCallCost(CI, VF, Legal->isMaskRequired(CI), NeedToScalarize);
if (getVectorIntrinsicIDForCall(CI, TLI)) {		if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);		InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);		return std::min(CallCost, IntrinsicCost);
}		}
return CallCost;		return CallCost;
}		}
case Instruction::ExtractValue:		case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);		return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
▲ Show 20 Lines • Show All 969 Lines • ▼ Show 20 Lines	for (unsigned In = 0; In < NumIncoming; In++) {
if (EdgeMask)		if (EdgeMask)
OperandsWithMask.push_back(EdgeMask);		OperandsWithMask.push_back(EdgeMask);
}		}
return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));		return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}		}

VPWidenCallRecipe VPRecipeBuilder::tryToWidenCall(CallInst CI,		VPWidenCallRecipe VPRecipeBuilder::tryToWidenCall(CallInst CI,
ArrayRef<VPValue *> Operands,		ArrayRef<VPValue *> Operands,
VFRange &Range) const {		VFRange &Range,
		VPlanPtr &Plan) {

bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(		bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {		[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);		return CM.isScalarWithPredication(CI, VF);
},		},
Range);		Range);

if (IsPredicated)		if (IsPredicated)
return nullptr;		return nullptr;

		VPValue *Mask = nullptr;
		if (Legal->isMaskRequired(CI))
		Mask = createBlockInMask(CI->getParent(), Plan);

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);		Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (ID && (ID == Intrinsic::assume \|\| ID == Intrinsic::lifetime_end \|\|		if (ID && (ID == Intrinsic::assume \|\| ID == Intrinsic::lifetime_end \|\|
ID == Intrinsic::lifetime_start \|\| ID == Intrinsic::sideeffect \|\|		ID == Intrinsic::lifetime_start \|\| ID == Intrinsic::sideeffect \|\|
ID == Intrinsic::pseudoprobe \|\|		ID == Intrinsic::pseudoprobe \|\|
ID == Intrinsic::experimental_noalias_scope_decl))		ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;		return nullptr;

auto willWiden = [&](ElementCount VF) -> bool {		auto willWiden = [&](ElementCount VF) -> bool {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);		Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// The following case may be scalarized depending on the VF.		// The following case may be scalarized depending on the VF.
// The flag shows whether we use Intrinsic or a usual Call for vectorized		// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.		// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?		// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;		bool NeedToScalarize = false;
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);		InstructionCost CallCost = CM.getVectorCallCost(
		CI, VF, Legal->isMaskRequired(CI), NeedToScalarize);
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;		InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;		bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
return UseVectorIntrinsic \|\| !NeedToScalarize;		return UseVectorIntrinsic \|\| !NeedToScalarize;
};		};

if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))		if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
		david-armUnsubmitted Not Done Reply Inline Actions Doesn't this mean we may end up picking the least optimal VF? For example, if there are v2i32 and v4i32 masked variants we'll only ever pick the v2i32, i.e. the lowest VF? david-arm: Doesn't this mean we may end up picking the least optimal VF? For example, if there are v2i32…
		huntergrAuthorUnsubmitted Done Reply Inline Actions No. Since we now store the pointer to the Function in the recipe, we need to force vplan to generate different plans for each VF that has a vector variant available. See the vplan checks for 'test_v2_v4m' in synthesize-mask-for-call.ll -- there are separate VF=2 and VF=4 plans, with a widened call to different functions. huntergr: No. Since we now store the pointer to the Function in the recipe, we need to force vplan to…
		david-armUnsubmitted Done Reply Inline Actions OK, can you add some comments here explaining why you are forcing the creation of a new vplan for every subsequent VF after discovering a vector variant? david-arm: OK, can you add some comments here explaining why you are forcing the creation of a new vplan…
return nullptr;		return nullptr;

ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());		ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));		return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), Mask);
		david-armUnsubmitted Done Reply Inline Actions nit: Could you change this to VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); david-arm: nit: Could you change this to VFShape Shape = VFShape::get(*CI, VariantVF…
}		}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {		bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&		assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
!isa<StoreInst>(I) && "Instruction should have been handled earlier");		!isa<StoreInst>(I) && "Instruction should have been handled earlier");
// Instruction should be widened, unless it is scalar after vectorization,		// Instruction should be widened, unless it is scalar after vectorization,
// scalarization is profitable or it is predicated.		// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {		auto WillScalarize = [this, I](ElementCount VF) -> bool {
▲ Show 20 Lines • Show All 242 Lines • ▼ Show 20 Lines	if (isa<TruncInst>(Instr) &&
return toVPRecipeResult(Recipe);		return toVPRecipeResult(Recipe);

// All widen recipes below deal only with VF > 1.		// All widen recipes below deal only with VF > 1.
if (LoopVectorizationPlanner::getDecisionAndClampRange(		if (LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return VF.isScalar(); }, Range))		[&](ElementCount VF) { return VF.isScalar(); }, Range))
return nullptr;		return nullptr;

if (auto *CI = dyn_cast<CallInst>(Instr))		if (auto *CI = dyn_cast<CallInst>(Instr))
return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));		return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));

if (isa<LoadInst>(Instr) \|\| isa<StoreInst>(Instr))		if (isa<LoadInst>(Instr) \|\| isa<StoreInst>(Instr))
return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));		return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

if (!shouldWiden(Instr, Range))		if (!shouldWiden(Instr, Range))
return nullptr;		return nullptr;

if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))		if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
▲ Show 20 Lines • Show All 692 Lines • ▼ Show 20 Lines	for (unsigned i = 0; i < IG->getFactor(); ++i) {
}		}
++OpIdx;		++OpIdx;
}		}
}		}
#endif		#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {		void VPWidenCallRecipe::execute(VPTransformState &State) {
State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,		State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
*this, State);		*this, State, Mask);
}		}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {		void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");		assert(!State.Instance && "Int or FP induction being replicated.");

Value *Start = getStartValue()->getLiveInIRValue();		Value *Start = getStartValue()->getLiveInIRValue();
const InductionDescriptor &ID = getInductionDescriptor();		const InductionDescriptor &ID = getInductionDescriptor();
TruncInst *Trunc = getTruncInst();		TruncInst *Trunc = getTruncInst();
▲ Show 20 Lines • Show All 1,337 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	class VPRecipeBuilder {
/// performs full if-conversion.		/// performs full if-conversion.
VPRecipeOrVPValueTy tryToBlend(PHINode Phi, ArrayRef<VPValue > Operands,		VPRecipeOrVPValueTy tryToBlend(PHINode Phi, ArrayRef<VPValue > Operands,
VPlanPtr &Plan);		VPlanPtr &Plan);

/// Handle call instructions. If \p CI can be widened for \p Range.Start,		/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same		/// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
/// decision from \p Range.Start to \p Range.End.		/// decision from \p Range.Start to \p Range.End.
VPWidenCallRecipe tryToWidenCall(CallInst CI, ArrayRef<VPValue *> Operands,		VPWidenCallRecipe tryToWidenCall(CallInst CI, ArrayRef<VPValue *> Operands,
VFRange &Range) const;		VFRange &Range, VPlanPtr &Plan);

/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe		/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
/// if it can. The function should only be called if the cost-model indicates		/// if it can. The function should only be called if the cost-model indicates
/// that widening should be performed.		/// that widening should be performed.
VPWidenRecipe tryToWiden(Instruction I, ArrayRef<VPValue *> Operands) const;		VPWidenRecipe tryToWiden(Instruction I, ArrayRef<VPValue *> Operands) const;

/// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.		/// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.
VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }		VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }
▲ Show 20 Lines • Show All 74 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlan.h

Show First 20 Lines • Show All 940 Lines • ▼ Show 20 Lines	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
/// Print the recipe.		/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,		void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;		VPSlotTracker &SlotTracker) const override;
#endif		#endif
};		};

/// A recipe for widening Call instructions.		/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPRecipeBase, public VPValue {		class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
		bool Mask;
		david-arm (inline comment, marked Done): Can you add some comments explaining what this is please? For example, that there should be one recipe for every VF because the variant requires a 1:1 mapping with the VF?

public:		public:
template <typename IterT>		template <typename IterT>
VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)		VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
		VPValue *MaskVal = nullptr)
: VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),		: VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
VPValue(VPValue::VPVWidenCallSC, &I, this) {}		VPValue(VPValue::VPVWidenCallSC, &I, this), Mask(MaskVal != nullptr) {
		if (MaskVal)
		addOperand(MaskVal);
		}

~VPWidenCallRecipe() override = default;		~VPWidenCallRecipe() override = default;

/// Method to support type inquiry through isa, cast, and dyn_cast.		/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {		static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenCallSC;		return D->getVPDefID() == VPRecipeBase::VPWidenCallSC;
}		}

▲ Show 20 Lines • Show All 2,106 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlanValue.h

Show First 20 Lines • Show All 264 Lines • ▼ Show 20 Lines	void setOperand(unsigned I, VPValue *New) {
New->addUser(*this);		New->addUser(*this);
}		}

void removeLastOperand() {		void removeLastOperand() {
VPValue *Op = Operands.pop_back_val();		VPValue *Op = Operands.pop_back_val();
Op->removeUser(*this);		Op->removeUser(*this);
}		}

		VPValue *removeAndReturnLastOperand() {
		VPValue *Op = Operands.pop_back_val();
		Op->removeUser(*this);
		return Op;
		}

typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;		typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;		typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
typedef iterator_range<operand_iterator> operand_range;		typedef iterator_range<operand_iterator> operand_range;
typedef iterator_range<const_operand_iterator> const_operand_range;		typedef iterator_range<const_operand_iterator> const_operand_range;

operand_iterator op_begin() { return Operands.begin(); }		operand_iterator op_begin() { return Operands.begin(); }
const_operand_iterator op_begin() const { return Operands.begin(); }		const_operand_iterator op_begin() const { return Operands.begin(); }
operand_iterator op_end() { return Operands.end(); }		operand_iterator op_end() { return Operands.end(); }
▲ Show 20 Lines • Show All 181 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -instsimplify -S \| FileCheck %s --check-prefixes=CHECK,LV		; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -instsimplify -S \| FileCheck %s --check-prefixes=CHECK,LV
		reames (inline comment, marked Done): Please autogenerate these tests for readability.
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -instsimplify -S \| FileCheck %s --check-prefixes=CHECK,TFALWAYS		; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -instsimplify -S \| FileCheck %s --check-prefixes=CHECK,TFALWAYS
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -instsimplify -S \| FileCheck %s --check-prefixes=CHECK,TFFALLBACK		; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -instsimplify -S \| FileCheck %s --check-prefixes=CHECK,TFFALLBACK

target triple = "aarch64-unknown-linux-gnu"		target triple = "aarch64-unknown-linux-gnu"

; A call whose argument must be widened. We check that tail folding uses the		; A call whose argument must be widened. We check that tail folding uses the
; primary mask, and that without tail folding we synthesize an all-true mask.		; primary mask, and that without tail folding we synthesize an all-true mask.
define void @test_widen(i64* noalias %a, i64* readnone %b) #4 {		define void @test_widen(i64* noalias %a, i64* readnone %b) #4 {
; CHECK-LABEL: @test_widen(		; CHECK-LABEL: @test_widen(
; LV-NOT: call <vscale x 2 x i64> @foo_vector		; LV: %[[LOAD:.+]] = load <vscale x 2 x i64>
; TFALWAYS-NOT: vector.body		; LV: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_vector		; TFALWAYS: %[[MASK:.+]] = phi <vscale x 2 x i1>
; TFFALLBACK-NOT: call <vscale x 2 x i64> @foo_vector		; TFALWAYS: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64
		; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MASK]])
		; TFFALLBACK: %[[MASK:.+]] = phi <vscale x 2 x i1>
		; TFFALLBACK: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64
		; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MASK]])
; CHECK: ret void		; CHECK: ret void
entry:		entry:
br label %for.body		br label %for.body

for.body:		for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gep = getelementptr i64, i64* %b, i64 %indvars.iv		%gep = getelementptr i64, i64* %b, i64 %indvars.iv
%load = load i64, i64* %gep		%load = load i64, i64* %gep
%call = call i64 @foo(i64 %load) #1		%call = call i64 @foo(i64 %load) #1
%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv		%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
store i64 %call, i64* %arrayidx		store i64 %call, i64* %arrayidx
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024		%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body		br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:		for.cond.cleanup:
ret void		ret void
}		}

; Check that a simple conditional call can be vectorized.		; Check that a simple conditional call can be vectorized.
define void @test_if_then(i64* noalias %a, i64* readnone %b) #4 {		define void @test_if_then(i64* noalias %a, i64* readnone %b) #4 {
; CHECK-LABEL: @test_if_then(		; CHECK-LABEL: @test_if_then(
; LV-NOT: call <vscale x 2 x i64> @foo_vector		; LV: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.load, <vscale x 2 x i1> %{{.+}})
; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_vector		; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %{{.+}})
; TFFALLBACK-NOT: call <vscale x 2 x i64> @foo_vector		; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %{{.+}})
; CHECK: ret void		; CHECK: ret void
entry:		entry:
br label %for.body		br label %for.body

for.body:		for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]		%indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv		%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
%0 = load i64, i64* %arrayidx, align 8		%0 = load i64, i64* %arrayidx, align 8
Show All 17 Lines
}		}

; This checks the ability to handle masking of an if-then-else CFG with		; This checks the ability to handle masking of an if-then-else CFG with
; calls inside the conditional blocks. Although one of the calls has a		; calls inside the conditional blocks. Although one of the calls has a
; uniform parameter and the metadata lists a uniform variant, right now		; uniform parameter and the metadata lists a uniform variant, right now
; we just see a splat of the parameter instead. More work needed.		; we just see a splat of the parameter instead. More work needed.
define void @test_widen_if_then_else(i64* noalias %a, i64* readnone %b) #4 {		define void @test_widen_if_then_else(i64* noalias %a, i64* readnone %b) #4 {
; CHECK-LABEL: @test_widen_if_then_else		; CHECK-LABEL: @test_widen_if_then_else
; LV-NOT: call <vscale x 2 x i64> @foo_vector		; LV: %[[LOAD:.+]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* %{{[0-9]+}}
; LV-NOT: call <vscale x 2 x i64> @foo_uniform		; LV: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_vector		; LV: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_uniform		; LV: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[INV]])
; TFFALLBACK-NOT: call <vscale x 2 x i64> @foo_vector		; LV: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[CMP]])
; TFFALLBACK-NOT: call <vscale x 2 x i64> @foo_uniform		; LV: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[INV]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
		; LV: store <vscale x 2 x i64> %[[PPHI]]
		; TFALWAYS: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.+}}, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
		; TFALWAYS: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
		; TFALWAYS: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
		; TFALWAYS: %[[MERGE1:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[INV]], <vscale x 2 x i1> zeroinitializer
		; TFALWAYS: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[MERGE1]])
		; TFALWAYS: %[[MERGE2:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[CMP]], <vscale x 2 x i1> zeroinitializer
		; TFALWAYS: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MERGE2]])
		; TFALWAYS: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[MERGE1]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
		; TFALWAYS: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %[[PPHI]]
		; TFFALLBACK: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.+}}, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
		; TFFALLBACK: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
		; TFFALLBACK: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
		; TFFALLBACK: %[[MERGE1:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[INV]], <vscale x 2 x i1> zeroinitializer
		; TFFALLBACK: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[MERGE1]])
		; TFFALLBACK: %[[MERGE2:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[CMP]], <vscale x 2 x i1> zeroinitializer
		; TFFALLBACK: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MERGE2]])
		; TFFALLBACK: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[MERGE1]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
		; TFFALLBACK: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %[[PPHI]]
; CHECK: ret void		; CHECK: ret void
entry:		entry:
br label %for.body		br label %for.body

for.body:		for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]		%indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv		%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
%0 = load i64, i64* %arrayidx, align 8		%0 = load i64, i64* %arrayidx, align 8
Show All 20 Lines	for.cond.cleanup:
ret void		ret void
}		}

; A call whose argument must be widened, where the vector variant does not have		; A call whose argument must be widened, where the vector variant does not have
; a mask. Forcing tail folding results in no vectorized call, whereas an		; a mask. Forcing tail folding results in no vectorized call, whereas an
; unpredicated body with scalar tail can use the unmasked variant.		; unpredicated body with scalar tail can use the unmasked variant.
define void @test_widen_nomask(i64* noalias %a, i64* readnone %b) #4 {		define void @test_widen_nomask(i64* noalias %a, i64* readnone %b) #4 {
; CHECK-LABEL: @test_widen_nomask(		; CHECK-LABEL: @test_widen_nomask(
; LV: call <vscale x 2 x i64> @foo_vector_nomask		; LV: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
; TFALWAYS-NOT: vector.body
; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_vector_nomask		; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_vector_nomask
; TFFALLBACK: call <vscale x 2 x i64> @foo_vector_nomask		; TFFALLBACK: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
; CHECK: ret void		; CHECK: ret void
entry:		entry:
br label %for.body		br label %for.body

for.body:		for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gep = getelementptr i64, i64* %b, i64 %indvars.iv		%gep = getelementptr i64, i64* %b, i64 %indvars.iv
%load = load i64, i64* %gep		%load = load i64, i64* %gep
%call = call i64 @foo(i64 %load) #2		%call = call i64 @foo(i64 %load) #2
%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv		%arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
store i64 %call, i64* %arrayidx		store i64 %call, i64* %arrayidx
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024		%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body		br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:		for.cond.cleanup:
ret void		ret void
}		}

; If both masked and unmasked options are present, we expect to see tail folding		; If both masked and unmasked options are present, we expect to see tail folding
; use the masked version and unpredicated body with scalar tail use the unmasked		; use the masked version and unpredicated body with scalar tail use the unmasked
; version.		; version.
define void @test_widen_optmask(i64* noalias %a, i64* readnone %b) #4 {		define void @test_widen_optmask(i64* noalias %a, i64* readnone %b) #4 {
; CHECK-LABEL: @test_widen_optmask(		; CHECK-LABEL: @test_widen_optmask(
; LV: call <vscale x 2 x i64> @foo_vector_nomask		; LV: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
; TFALWAYS-NOT: vector.body		; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %active.lane.mask)
; TFALWAYS-NOT: call <vscale x 2 x i64> @foo_vector		; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %active.lane.mask)
; TFFALLBACK: call <vscale x 2 x i64> @foo_vector_nomask
; CHECK: ret void		; CHECK: ret void
entry:		entry:
br label %for.body		br label %for.body

for.body:		for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gep = getelementptr i64, i64* %b, i64 %indvars.iv		%gep = getelementptr i64, i64* %b, i64 %indvars.iv
%load = load i64, i64* %gep		%load = load i64, i64* %gep
Show All 23 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Synthesize mask operands for vector variants as needed
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 454772

llvm/include/llvm/Analysis/VectorUtils.h

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

llvm/lib/Transforms/Vectorize/VPlan.h

llvm/lib/Transforms/Vectorize/VPlanValue.h

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Synthesize mask operands for vector variants as needed
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 454772

llvm/include/llvm/Analysis/VectorUtils.h

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

llvm/lib/Transforms/Vectorize/VPlan.h

llvm/lib/Transforms/Vectorize/VPlanValue.h

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

[LoopVectorize] Synthesize mask operands for vector variants as needed
ClosedPublic