Diff 497308

llvm/include/llvm/Analysis/VectorUtils.h

	Show First 20 Lines • Show All 119 Lines • ▼ Show 20 Lines
	};			};

	/// Holds the VFShape for a specific scalar to vector function mapping.			/// Holds the VFShape for a specific scalar to vector function mapping.
	struct VFInfo {			struct VFInfo {
	VFShape Shape; /// Classification of the vector function.			VFShape Shape; /// Classification of the vector function.
	std::string ScalarName; /// Scalar Function Name.			std::string ScalarName; /// Scalar Function Name.
	std::string VectorName; /// Vector Function Name associated to this VFInfo.			std::string VectorName; /// Vector Function Name associated to this VFInfo.
	VFISAKind ISA; /// Instruction Set Architecture.			VFISAKind ISA; /// Instruction Set Architecture.

				/// Returns the index of the first parameter with the kind 'GlobalPredicate',
				david-armUnsubmitted Done Reply Inline Actions Might be worth adding `///` comments here, since the others all have them? david-arm: Might be worth adding `///` comments here, since the others all have them?
				/// if any exist.
				std::optional<unsigned> getParamIndexForOptionalMask() const {
				unsigned ParamCount = Shape.Parameters.size();
				for (unsigned i = 0; i < ParamCount; ++i)
				if (Shape.Parameters[i].ParamKind == VFParamKind::GlobalPredicate)
				david-armUnsubmitted Done Reply Inline Actions I think this will be compiled away in a release build, right? So really it's just a non-release wrapper around `getParamIndexForOptionalMask`. Given it's only called in one place is it worth just making `getParamIndexForOptionalMask` public instead and putting an assert in LoopVectorize.cpp that a mask exists? david-arm: I think this will be compiled away in a release build, right? So really it's just a non-release…
				return i;

				return std::nullopt;
				david-armUnsubmitted Not Done Reply Inline Actions This function is never called - can it be deleted? david-arm: This function is never called - can it be deleted?
				huntergrAuthorUnsubmitted Done Reply Inline Actions There's now a use for it in the assert when building a recipe. (This was used in the original patch, but was left in when splitting into 3 parts). huntergr: There's now a use for it in the assert when building a recipe. (This was used in the original…
				}

				/// Returns true if at least one of the operands to the vectorized function
				/// has the kind 'GlobalPredicate'.
				bool isMasked() const { return getParamIndexForOptionalMask().has_value(); }
	};			};

	namespace VFABI {			namespace VFABI {
	/// LLVM Internal VFABI ISA token for vector functions.			/// LLVM Internal VFABI ISA token for vector functions.
	static constexpr char const *_LLVM_ = "_LLVM_";			static constexpr char const *_LLVM_ = "_LLVM_";
	/// Prefix for internal name redirection for vector function that			/// Prefix for internal name redirection for vector function that
	/// tells the compiler to scalarize the call using the scalar name			/// tells the compiler to scalarize the call using the scalar name
	/// of the function. For example, a mangled name like			/// of the function. For example, a mangled name like
	▲ Show 20 Lines • Show All 851 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,587 Lines • ▼ Show 20 Lines	public:
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;		InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

/// Estimate cost of a call instruction CI if it were vectorized with factor		/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead		/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be		/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -		/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.		/// i.e. either vector version isn't available, or is too expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,		InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize) const;		Function **Variant,
		bool *NeedsMask = nullptr) const;

/// Returns true if the per-lane cost of VectorizationFactor A is lower than		/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.		/// that of B.
bool isMoreProfitable(const VectorizationFactor &A,		bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B) const;		const VectorizationFactor &B) const;

/// Invalidates decisions already taken by the cost model.		/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {		void invalidateCostModelingDecisions() {
▲ Show 20 Lines • Show All 1,831 Lines • ▼ Show 20 Lines	if (Instruction *V = CSEMap.lookup(&In)) {
In.eraseFromParent();		In.eraseFromParent();
continue;		continue;
}		}

CSEMap[&In] = &In;		CSEMap[&In] = &In;
}		}
}		}

InstructionCost		InstructionCost LoopVectorizationCostModel::getVectorCallCost(
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,		CallInst CI, ElementCount VF, Function Variant, bool NeedsMask) const {
bool &NeedToScalarize) const {
Function *F = CI->getCalledFunction();		Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();		Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;		SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->args())		for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());		ScalarTys.push_back(ArgOp->getType());

// Estimate cost of scalarized vector call. The source operands are assumed		// Estimate cost of scalarized vector call. The source operands are assumed
// to be vectors, so we need to extract individual elements from there,		// to be vectors, so we need to extract individual elements from there,
Show All 15 Lines	InstructionCost LoopVectorizationCostModel::getVectorCallCost(
InstructionCost ScalarizationCost =		InstructionCost ScalarizationCost =
getScalarizationOverhead(CI, VF, CostKind);		getScalarizationOverhead(CI, VF, CostKind);

InstructionCost Cost =		InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;		ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

// If we can't emit a vector call for this function, then the currently found		// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.		// cost is the cost we need to return.
NeedToScalarize = true;		InstructionCost MaskCost = 0;
VFShape Shape = VFShape::get(CI, VF, false /HasGlobalPred*/);		VFShape Shape = VFShape::get(CI, VF, false /HasGlobalPred*/);
Function VecFunc = VFDatabase(CI).getVectorizedFunction(Shape);		Function VecFunc = VFDatabase(CI).getVectorizedFunction(Shape);
		// If we want an unmasked vector function but can't find one matching the VF,
		// maybe we can find vector function that does use a mask and synthesize
		// an all-true mask.
		if (!VecFunc) {
		david-armUnsubmitted Not Done Reply Inline Actions This looks a little strange to me. In my mind, the ability to emit an active lane mask based on two integer inputs is orthogonal to how cheap it is to broadcast a true bit across a predicate. For example, an architecture may cheaply support the latter, but not the former. Maybe X86 is such an example? Can we not just let the mask cost decide the behaviour? That way you can simplify this to just if (!VecFunc) { ... david-arm: This looks a little strange to me. In my mind, the ability to emit an active lane mask based on…
		huntergrAuthorUnsubmitted Done Reply Inline Actions My thinking was to treat the capability to emit an active lane mask as a proxy for being able to use masks at all, but perhaps that's a little too conservative. I don't know if we should add a proper TTI interface to represent that capability, or just rely on the VFDatabase only having entries which the target is capable of supporting. In any case, I've removed that check for now. huntergr: My thinking was to treat the capability to emit an active lane mask as a proxy for being able…
		Shape = VFShape::get(CI, VF, /HasGlobalPred=*/true);
		VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
		// If we found one, add in the cost of creating a mask
		if (VecFunc) {
		if (NeedsMask)
		*NeedsMask = true;
		MaskCost = TTI.getShuffleCost(
		TargetTransformInfo::SK_Broadcast,
		VectorType::get(
		IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
		VF));
		}
		}

if (!TLI \|\| CI->isNoBuiltin() \|\| !VecFunc)		if (!TLI \|\| CI->isNoBuiltin() \|\| !VecFunc)
return Cost;		return Cost;

// If the corresponding vector cost is cheaper, return its cost.		// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =		InstructionCost VectorCallCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);		TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
if (VectorCallCost < Cost) {		if (VectorCallCost < Cost) {
NeedToScalarize = false;		*Variant = VecFunc;
Cost = VectorCallCost;		Cost = VectorCallCost;
		david-armUnsubmitted Done Reply Inline Actions Do we really need both the `Variant` and the `NeedToScalarize` parameter? It looks naively that setting `Variant = VecFunc` is synonymous with `NeedToScalarize = false`. I haven't looked into this in detail so I could be wrong, but it might make more sense to remove the `NeedToScalarize` in favour of setting `Variant`? david-arm: Do we really need both the `Variant` and the `NeedToScalarize` parameter? It looks naively that…
}		}
return Cost;		return Cost;
}		}

static Type MaybeVectorizeType(Type Elt, ElementCount VF) {		static Type MaybeVectorizeType(Type Elt, ElementCount VF) {
if (VF.isScalar() \|\| (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))		if (VF.isScalar() \|\| (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
return Elt;		return Elt;
return VectorType::get(Elt, VF);		return VectorType::get(Elt, VF);
▲ Show 20 Lines • Show All 3,823 Lines • ▼ Show 20 Lines	case Instruction::FPTrunc: {
}		}

return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);		return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}		}
case Instruction::Call: {		case Instruction::Call: {
if (RecurrenceDescriptor::isFMulAddIntrinsic(I))		if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))		if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
return *RedCost;		return *RedCost;
bool NeedToScalarize;		Function *Variant;
CallInst *CI = cast<CallInst>(I);		CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);		InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
if (getVectorIntrinsicIDForCall(CI, TLI)) {		if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);		InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);		return std::min(CallCost, IntrinsicCost);
}		}
return CallCost;		return CallCost;
}		}
case Instruction::ExtractValue:		case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);		return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
▲ Show 20 Lines • Show All 986 Lines • ▼ Show 20 Lines	for (unsigned In = 0; In < NumIncoming; In++) {
if (EdgeMask)		if (EdgeMask)
OperandsWithMask.push_back(EdgeMask);		OperandsWithMask.push_back(EdgeMask);
}		}
return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));		return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}		}

VPWidenCallRecipe VPRecipeBuilder::tryToWidenCall(CallInst CI,		VPWidenCallRecipe VPRecipeBuilder::tryToWidenCall(CallInst CI,
ArrayRef<VPValue *> Operands,		ArrayRef<VPValue *> Operands,
VFRange &Range) const {		VFRange &Range,
		VPlanPtr &Plan) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(		bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {		[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);		return CM.isScalarWithPredication(CI, VF);
},		},
Range);		Range);

if (IsPredicated)		if (IsPredicated)
return nullptr;		return nullptr;

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);		Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (ID && (ID == Intrinsic::assume \|\| ID == Intrinsic::lifetime_end \|\|		if (ID && (ID == Intrinsic::assume \|\| ID == Intrinsic::lifetime_end \|\|
ID == Intrinsic::lifetime_start \|\| ID == Intrinsic::sideeffect \|\|		ID == Intrinsic::lifetime_start \|\| ID == Intrinsic::sideeffect \|\|
ID == Intrinsic::pseudoprobe \|\|		ID == Intrinsic::pseudoprobe \|\|
ID == Intrinsic::experimental_noalias_scope_decl))		ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;		return nullptr;

ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());		SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));

// Is it beneficial to perform intrinsic call compared to lib call?		// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =		bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(		ID && LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {		[&](ElementCount VF) -> bool {
bool NeedToScalarize = false;		Function *Variant;
// Is it beneficial to perform intrinsic call compared to lib		// Is it beneficial to perform intrinsic call compared to lib
// call?		// call?
InstructionCost CallCost =		InstructionCost CallCost =
CM.getVectorCallCost(CI, VF, NeedToScalarize);		CM.getVectorCallCost(CI, VF, &Variant);
InstructionCost IntrinsicCost =		InstructionCost IntrinsicCost =
CM.getVectorIntrinsicCost(CI, VF);		CM.getVectorIntrinsicCost(CI, VF);
return IntrinsicCost <= CallCost;		return IntrinsicCost <= CallCost;
},		},
Range);		Range);
if (ShouldUseVectorIntrinsic)		if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);		return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);

		Function *Variant = nullptr;
		ElementCount VariantVF;
		bool NeedsMask = false;
// Is better to call a vectorized version of the function than to to scalarize		// Is better to call a vectorized version of the function than to to scalarize
// the call?		// the call?
auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(		auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {		[&](ElementCount VF) -> bool {
// The following case may be scalarized depending on the VF.		// The following case may be scalarized depending on the VF.
// The flag shows whether we can use a usual Call for vectorized		// The flag shows whether we can use a usual Call for vectorized
// version of the instruction.		// version of the instruction.
bool NeedToScalarize = false;
		david-armUnsubmitted Not Done Reply Inline Actions Doesn't this mean we may end up picking the least optimal VF? For example, if there are v2i32 and v4i32 masked variants we'll only ever pick the v2i32, i.e. the lowest VF? david-arm: Doesn't this mean we may end up picking the least optimal VF? For example, if there are v2i32…
		huntergrAuthorUnsubmitted Done Reply Inline Actions No. Since we now store the pointer to the Function in the recipe, we need to force vplan to generate different plans for each VF that has a vector variant available. See the vplan checks for 'test_v2_v4m' in synthesize-mask-for-call.ll -- there are separate VF=2 and VF=4 plans, with a widened call to different functions. huntergr: No. Since we now store the pointer to the Function in the recipe, we need to force vplan to…
		david-armUnsubmitted Done Reply Inline Actions OK, can you add some comments here explaining why you are forcing the creation of a new vplan for every subsequent VF after discovering a vector variant? david-arm: OK, can you add some comments here explaining why you are forcing the creation of a new vplan…
CM.getVectorCallCost(CI, VF, NeedToScalarize);		// If we've found a variant at a previous VF, then stop looking. A
return !NeedToScalarize;		// vectorized variant of a function expects input in a certain shape
		// -- basically the number of input registers, the number of lanes
		// per register, and whether there's a mask required.
		// We store a pointer to the variant in the VPWidenCallRecipe, so
		// once we have an appropriate variant it's only valid for that VF.
		// This will force a different vplan to be generated for each VF that
		// finds a valid variant.
		if (Variant)
		return false;
		CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
		// If we found a valid vector variant at this VF, then store the VF
		// in case we need to generate a mask.
		if (Variant)
		VariantVF = VF;
		return Variant != nullptr;
},		},
Range);		Range);
if (ShouldUseVectorCall)		if (ShouldUseVectorCall) {
		if (NeedsMask) {
		// If our vector variant requires a mask, then synthesize an all-true
		// mask and insert it into the operands vector in the right place.
		VPValue *Mask = Plan->getOrAddVPValue(ConstantInt::getTrue(
		IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
		david-armUnsubmitted Done Reply Inline Actions nit: Could you change this to `VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);` david-arm: nit: Could you change this to VFShape Shape = VFShape::get(*CI, VariantVF…

		VFShape Shape = VFShape::get(CI, VariantVF, /HasGlobalPred=*/true);
		unsigned MaskPos = 0;

		for (VFInfo Info : VFDatabase::getMappings(*CI))
		if (Info.Shape == Shape) {
		assert(Info.isMasked() && "Vector function info shape mismatch");
		MaskPos = Info.getParamIndexForOptionalMask().value();
		break;
		}

		Ops.insert(Ops.begin() + MaskPos, Mask);
		}

return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),		return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
Intrinsic::not_intrinsic);		Intrinsic::not_intrinsic, Variant);
		}

return nullptr;		return nullptr;
}		}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {		bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&		assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
!isa<StoreInst>(I) && "Instruction should have been handled earlier");		!isa<StoreInst>(I) && "Instruction should have been handled earlier");
// Instruction should be widened, unless it is scalar after vectorization,		// Instruction should be widened, unless it is scalar after vectorization,
▲ Show 20 Lines • Show All 254 Lines • ▼ Show 20 Lines	if (isa<TruncInst>(Instr) &&
return toVPRecipeResult(Recipe);		return toVPRecipeResult(Recipe);

// All widen recipes below deal only with VF > 1.		// All widen recipes below deal only with VF > 1.
if (LoopVectorizationPlanner::getDecisionAndClampRange(		if (LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return VF.isScalar(); }, Range))		[&](ElementCount VF) { return VF.isScalar(); }, Range))
return nullptr;		return nullptr;

if (auto *CI = dyn_cast<CallInst>(Instr))		if (auto *CI = dyn_cast<CallInst>(Instr))
return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));		return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));

if (isa<LoadInst>(Instr) \|\| isa<StoreInst>(Instr))		if (isa<LoadInst>(Instr) \|\| isa<StoreInst>(Instr))
return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));		return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

if (!shouldWiden(Instr, Range))		if (!shouldWiden(Instr, Range))
return nullptr;		return nullptr;

if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))		if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
▲ Show 20 Lines • Show All 1,924 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	class VPRecipeBuilder {
/// performs full if-conversion.		/// performs full if-conversion.
VPRecipeOrVPValueTy tryToBlend(PHINode Phi, ArrayRef<VPValue > Operands,		VPRecipeOrVPValueTy tryToBlend(PHINode Phi, ArrayRef<VPValue > Operands,
VPlanPtr &Plan);		VPlanPtr &Plan);

/// Handle call instructions. If \p CI can be widened for \p Range.Start,		/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same		/// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
/// decision from \p Range.Start to \p Range.End.		/// decision from \p Range.Start to \p Range.End.
VPWidenCallRecipe tryToWidenCall(CallInst CI, ArrayRef<VPValue *> Operands,		VPWidenCallRecipe tryToWidenCall(CallInst CI, ArrayRef<VPValue *> Operands,
VFRange &Range) const;		VFRange &Range, VPlanPtr &Plan) const;

/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe		/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
/// if it can. The function should only be called if the cost-model indicates		/// if it can. The function should only be called if the cost-model indicates
/// that widening should be performed.		/// that widening should be performed.
VPRecipeBase tryToWiden(Instruction I, ArrayRef<VPValue *> Operands,		VPRecipeBase tryToWiden(Instruction I, ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB, VPlanPtr &Plan);		VPBasicBlock *VPBB, VPlanPtr &Plan);

/// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.		/// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.
▲ Show 20 Lines • Show All 75 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlan.h

	Show First 20 Lines • Show All 925 Lines • ▼ Show 20 Lines
	#endif			#endif
	};			};

	/// A recipe for widening Call instructions.			/// A recipe for widening Call instructions.
	class VPWidenCallRecipe : public VPRecipeBase, public VPValue {			class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
	/// ID of the vector intrinsic to call when widening the call. If set the			/// ID of the vector intrinsic to call when widening the call. If set the
	/// Intrinsic::not_intrinsic, a library call will be used instead.			/// Intrinsic::not_intrinsic, a library call will be used instead.
	Intrinsic::ID VectorIntrinsicID;			Intrinsic::ID VectorIntrinsicID;
				/// If this recipe represents a library call, Variant stores a pointer to
				david-armUnsubmitted Done Reply Inline Actions Can you add some comments explaining what this is please? For example, that there should be one recipe for every VF because the variant requires a 1:1 mapping with the VF? david-arm: Can you add some comments explaining what this is please? For example, that there should be one…
				/// the chosen function. There is a 1:1 mapping between a given VF and the
				/// chosen vectorized variant, so there will be a different vplan for each
				/// VF with a valid variant.
				Function *Variant;

	public:			public:
	template <typename IterT>			template <typename IterT>
	VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,			VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
	Intrinsic::ID VectorIntrinsicID)			Intrinsic::ID VectorIntrinsicID,
				Function *Variant = nullptr)
	: VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I),			: VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I),
	VectorIntrinsicID(VectorIntrinsicID) {}			VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {}

	~VPWidenCallRecipe() override = default;			~VPWidenCallRecipe() override = default;

	VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)			VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)

	/// Produce a widened version of the call instruction.			/// Produce a widened version of the call instruction.
	void execute(VPTransformState &State) override;			void execute(VPTransformState &State) override;

	▲ Show 20 Lines • Show All 1,770 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Show First 20 Lines • Show All 448 Lines • ▼ Show 20 Lines
}		}

void VPWidenCallRecipe::execute(VPTransformState &State) {		void VPWidenCallRecipe::execute(VPTransformState &State) {
auto &CI = *cast<CallInst>(getUnderlyingInstr());		auto &CI = *cast<CallInst>(getUnderlyingInstr());
assert(!isa<DbgInfoIntrinsic>(CI) &&		assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");		"DbgInfoIntrinsic should have been dropped during VPlan construction");
State.setDebugLocFromInst(&CI);		State.setDebugLocFromInst(&CI);

SmallVector<Type *, 4> Tys;
for (Value *ArgOperand : CI.args())
Tys.push_back(
ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue()));

for (unsigned Part = 0; Part < State.UF; ++Part) {		for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Type *, 2> TysForDecl = {CI.getType()};		SmallVector<Type *, 2> TysForDecl = {CI.getType()};
SmallVector<Value *, 4> Args;		SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {		for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a		// Some intrinsics have a scalar argument - don't replace it with a
// vector.		// vector.
Value *Arg;		Value *Arg;
if (VectorIntrinsicID == Intrinsic::not_intrinsic \|\|		if (VectorIntrinsicID == Intrinsic::not_intrinsic \|\|
Show All 11 Lines	if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
// Use vector version of the intrinsic.		// Use vector version of the intrinsic.
if (State.VF.isVector())		if (State.VF.isVector())
TysForDecl[0] =		TysForDecl[0] =
VectorType::get(CI.getType()->getScalarType(), State.VF);		VectorType::get(CI.getType()->getScalarType(), State.VF);
Module *M = State.Builder.GetInsertBlock()->getModule();		Module *M = State.Builder.GetInsertBlock()->getModule();
VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);		VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");		assert(VectorF && "Can't retrieve vector intrinsic.");
} else {		} else {
// Use vector version of the function call.
const VFShape Shape = VFShape::get(CI, State.VF, false /HasGlobalPred/);
#ifndef NDEBUG		#ifndef NDEBUG
assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&		assert(Variant != nullptr && "Can't create vector function.");
"Can't create vector function.");
#endif		#endif
VectorF = VFDatabase(CI).getVectorizedFunction(Shape);		VectorF = Variant;
}		}
		fhahnUnsubmitted Done Reply Inline Actions It would be good if the decision whether to use the masked or non-masked variant would be taken at the time of VPlan construction instead of during execution. It would probably also be good to pass in the mask as operand to the recipe, especially if we want to support non-trivial masks in the future. fhahn: It would be good if the decision whether to use the masked or non-masked variant would be…
		huntergrAuthorUnsubmitted Not Done Reply Inline Actions So the problem I had with trying to decide up front was that you might have both masked and unmasked variants available, and the decision on which one to use is left to the cost model -- which I think is calculated after VPlan construction. For example, on AArch64 you might have a non-masked NEON variant and a masked SVE variant. If you know the implementation width is 128b, then the cost would be slightly higher for generating the mask for the SVE variant. If it's 256b or higher, it might be worth the extra cost due to additional parallelism. Is there a (straightforward) way to tell VPlan that it may need to construct different recipes based on masked/non-masked variants being available? Or would this need some reworking of VPlan? I did add the mask as an operand in the case where it is required though. If we can generate multiple recipes easily then it can be added to the operands when a dummy mask is required (and possibly shared if there are multiple calls, giving a more accurate cost). huntergr: So the problem I had with trying to decide up front was that you might have both masked and…

SmallVector<OperandBundleDef, 1> OpBundles;		SmallVector<OperandBundleDef, 1> OpBundles;
CI.getOperandBundlesAsDefs(OpBundles);		CI.getOperandBundlesAsDefs(OpBundles);
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);		CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

if (isa<FPMathOperator>(V))		if (isa<FPMathOperator>(V))
V->copyFastMathFlags(&CI);		V->copyFastMathFlags(&CI);

State.set(this, V, Part);		State.set(this, V, Part);
Show All 15 Lines	void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
}		}

O << "call @" << CI->getCalledFunction()->getName() << "(";		O << "call @" << CI->getCalledFunction()->getName() << "(";
printOperands(O, SlotTracker);		printOperands(O, SlotTracker);
O << ")";		O << ")";

if (VectorIntrinsicID)		if (VectorIntrinsicID)
O << " (using vector intrinsic)";		O << " (using vector intrinsic)";
else		else {
O << " (using library function)";		O << " (using library function";
		if (Variant->hasName())
		O << ": " << Variant->getName();
		O << ")";
		}
}		}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,		void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {		VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-SELECT ";		O << Indent << "WIDEN-SELECT ";
printAsOperand(O, SlotTracker);		printAsOperand(O, SlotTracker);
O << " = select ";		O << " = select ";
getOperand(0)->printAsOperand(O, SlotTracker);		getOperand(0)->printAsOperand(O, SlotTracker);
▲ Show 20 Lines • Show All 796 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
		reames (inline comment, Done): Please autogenerate these tests for readability.
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S \| FileCheck %s --check-prefixes=TFNONE		; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S \| FileCheck %s --check-prefixes=TFNONE
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S \| FileCheck %s --check-prefixes=TFALWAYS		; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S \| FileCheck %s --check-prefixes=TFALWAYS
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S \| FileCheck %s --check-prefixes=TFFALLBACK		; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S \| FileCheck %s --check-prefixes=TFFALLBACK

target triple = "aarch64-unknown-linux-gnu"		target triple = "aarch64-unknown-linux-gnu"

; A call whose argument must be widened. We check that tail folding uses the		; A call whose argument must be widened. We check that tail folding uses the
; primary mask, and that without tail folding we synthesize an all-true mask.		; primary mask, and that without tail folding we synthesize an all-true mask.
define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {		define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
; TFNONE-LABEL: @test_widen(		; TFNONE-LABEL: @test_widen(
; TFNONE-NEXT: entry:		; TFNONE-NEXT: entry:
; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]		; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
		; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
		; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
		; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
; TFNONE: vector.ph:		; TFNONE: vector.ph:
		; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
		; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
		; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
		; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]		; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
; TFNONE: vector.body:		; TFNONE: vector.body:
; TFNONE-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]		; TFNONE-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
; TFNONE-NEXT: [[TMP0:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]		; TFNONE-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]
; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4		; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
; TFNONE-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0		; TFNONE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]		; TFNONE-NEXT: [[TMP6:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]
; TFNONE-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1		; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
; TFNONE-NEXT: [[TMP5:%.*]] = call i64 @foo(i64 [[TMP4]]) #[[ATTR2]]		; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFNONE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0		; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFNONE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP5]], i32 1		; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; TFNONE-NEXT: [[TMP8:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]		; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFNONE-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP8]], align 4		; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; TFNONE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; TFNONE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFNONE: middle.block:		; TFNONE: middle.block:
; TFNONE-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]		; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
		; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; TFNONE: scalar.ph:		; TFNONE: scalar.ph:
; TFNONE-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]		; TFNONE-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
; TFNONE-NEXT: br label [[FOR_BODY:%.*]]		; TFNONE-NEXT: br label [[FOR_BODY:%.*]]
; TFNONE: for.body:		; TFNONE: for.body:
; TFNONE-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]		; TFNONE-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]		; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4		; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2]]		; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]		; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4		; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1		; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024		; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]		; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TFNONE: for.cond.cleanup:		; TFNONE: for.cond.cleanup:
; TFNONE-NEXT: ret void		; TFNONE-NEXT: ret void
;		;
; TFALWAYS-LABEL: @test_widen(		; TFALWAYS-LABEL: @test_widen(
; TFALWAYS-NEXT: entry:		; TFALWAYS-NEXT: entry:
; TFALWAYS-NEXT: br label [[FOR_BODY:%.*]]		; TFALWAYS-NEXT: br label [[FOR_BODY:%.*]]
; TFALWAYS: for.body:		; TFALWAYS: for.body:
; TFALWAYS-NEXT: [[INDVARS_IV:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]		; TFALWAYS-NEXT: [[INDVARS_IV:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; TFALWAYS-NEXT: [[GEP:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDVARS_IV]]		; TFALWAYS-NEXT: [[GEP:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDVARS_IV]]
; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4		; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]]		; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]]
; TFALWAYS-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDVARS_IV]]		; TFALWAYS-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDVARS_IV]]
; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4		; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1		; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; TFALWAYS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024		; TFALWAYS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
; TFALWAYS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]		; TFALWAYS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; TFALWAYS: for.cond.cleanup:		; TFALWAYS: for.cond.cleanup:
; TFALWAYS-NEXT: ret void		; TFALWAYS-NEXT: ret void
;		;
; TFFALLBACK-LABEL: @test_widen(		; TFFALLBACK-LABEL: @test_widen(
; TFFALLBACK-NEXT: entry:		; TFFALLBACK-NEXT: entry:
; TFFALLBACK-NEXT: br i1 false, label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]		; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
		; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
		; TFFALLBACK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
		; TFFALLBACK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
; TFFALLBACK: vector.ph:		; TFFALLBACK: vector.ph:
		; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
		; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
		; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
		; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]		; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
; TFFALLBACK: vector.body:		; TFFALLBACK: vector.body:
; TFFALLBACK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]		; TFFALLBACK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
; TFFALLBACK-NEXT: [[TMP0:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]		; TFFALLBACK-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]
; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4		; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
; TFFALLBACK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0		; TFFALLBACK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]		; TFFALLBACK-NEXT: [[TMP6:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]
; TFFALLBACK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1		; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
; TFFALLBACK-NEXT: [[TMP5:%.*]] = call i64 @foo(i64 [[TMP4]]) #[[ATTR2]]		; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFFALLBACK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0		; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFFALLBACK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP5]], i32 1		; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; TFFALLBACK-NEXT: [[TMP8:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]		; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFFALLBACK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP8]], align 4		; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; TFFALLBACK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; TFFALLBACK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TFFALLBACK: middle.block:		; TFFALLBACK: middle.block:
; TFFALLBACK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]		; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
		; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; TFFALLBACK: scalar.ph:		; TFFALLBACK: scalar.ph:
; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]		; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]]		; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]]
; TFFALLBACK: for.body:		; TFFALLBACK: for.body:
; TFFALLBACK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]		; TFFALLBACK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]		; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4		; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2]]		; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]		; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4		; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1		; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024		; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]		; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TFFALLBACK: for.cond.cleanup:		; TFFALLBACK: for.cond.cleanup:
; TFFALLBACK-NEXT: ret void		; TFFALLBACK-NEXT: ret void
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body:		for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
▲ Show 20 Lines • Show All 229 Lines • ▼ Show 20 Lines
; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2		; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]		; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]		; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]		; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
; TFNONE: vector.body:		; TFNONE: vector.body:
; TFNONE-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]		; TFNONE-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
; TFNONE-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]		; TFNONE-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]
; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4		; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
; TFNONE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])		; TFNONE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
; TFNONE-NEXT: [[TMP7:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]		; TFNONE-NEXT: [[TMP6:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]
; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], align 4		; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()		; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2		; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]		; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]		; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]		; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TFNONE: middle.block:		; TFNONE: middle.block:
; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]		; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]		; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; TFNONE: scalar.ph:		; TFNONE: scalar.ph:
; TFNONE-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]		; TFNONE-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
; TFNONE-NEXT: br label [[FOR_BODY:%.*]]		; TFNONE-NEXT: br label [[FOR_BODY:%.*]]
; TFNONE: for.body:		; TFNONE: for.body:
; TFNONE-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]		; TFNONE-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
Show All 35 Lines
; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2		; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]		; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]		; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]		; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
; TFFALLBACK: vector.body:		; TFFALLBACK: vector.body:
; TFFALLBACK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]		; TFFALLBACK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
; TFFALLBACK-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]		; TFFALLBACK-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]
; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4		; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
; TFFALLBACK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])		; TFFALLBACK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
; TFFALLBACK-NEXT: [[TMP7:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]		; TFFALLBACK-NEXT: [[TMP6:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]
; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], align 4		; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
; TFFALLBACK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()		; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFFALLBACK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2		; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]		; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; TFFALLBACK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]		; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFFALLBACK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]		; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TFFALLBACK: middle.block:		; TFFALLBACK: middle.block:
; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]		; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]		; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; TFFALLBACK: scalar.ph:		; TFFALLBACK: scalar.ph:
; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]		; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]]		; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]]
; TFFALLBACK: for.body:		; TFFALLBACK: for.body:
; TFFALLBACK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]		; TFFALLBACK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines
; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2		; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]		; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]		; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]		; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
; TFNONE: vector.body:		; TFNONE: vector.body:
; TFNONE-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]		; TFNONE-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
; TFNONE-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]		; TFNONE-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]
; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4		; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
; TFNONE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])		; TFNONE-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
; TFNONE-NEXT: [[TMP7:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]		; TFNONE-NEXT: [[TMP6:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]
; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], align 4		; TFNONE-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()		; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2		; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]		; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]		; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]		; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TFNONE: middle.block:		; TFNONE: middle.block:
; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]		; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]		; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; TFNONE: scalar.ph:		; TFNONE: scalar.ph:
; TFNONE-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]		; TFNONE-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
; TFNONE-NEXT: br label [[FOR_BODY:%.*]]		; TFNONE-NEXT: br label [[FOR_BODY:%.*]]
; TFNONE: for.body:		; TFNONE: for.body:
; TFNONE-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]		; TFNONE-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
Show All 35 Lines
; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2		; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]		; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]		; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]		; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
; TFFALLBACK: vector.body:		; TFFALLBACK: vector.body:
; TFFALLBACK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]		; TFFALLBACK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
; TFFALLBACK-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]		; TFFALLBACK-NEXT: [[TMP4:%.]] = getelementptr i64, ptr [[B:%.]], i64 [[INDEX]]
; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4		; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
; TFFALLBACK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])		; TFFALLBACK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
; TFFALLBACK-NEXT: [[TMP7:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]		; TFFALLBACK-NEXT: [[TMP6:%.]] = getelementptr inbounds i64, ptr [[A:%.]], i64 [[INDEX]]
; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], align 4		; TFFALLBACK-NEXT: store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
; TFFALLBACK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()		; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TFFALLBACK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2		; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]		; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; TFFALLBACK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]		; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFFALLBACK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]		; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TFFALLBACK: middle.block:		; TFFALLBACK: middle.block:
; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]		; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]		; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; TFFALLBACK: scalar.ph:		; TFFALLBACK: scalar.ph:
; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]		; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]]		; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]]
; TFFALLBACK: for.body:		; TFFALLBACK: for.body:
; TFFALLBACK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]		; TFFALLBACK-NEXT: [[INDVARS_IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.]], [[FOR_BODY]] ]
Show All 23 Lines	for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body		br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:		for.cond.cleanup:
ret void		ret void
}		}

declare i64 @foo(i64)		declare i64 @foo(i64)

; vector variants of foo		;; scalable vector variants of foo
declare <vscale x 2 x i64> @foo_uniform(i64, <vscale x 2 x i1>)		declare <vscale x 2 x i64> @foo_uniform(i64, <vscale x 2 x i1>)
declare <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64>, <vscale x 2 x i1>)		declare <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64>, <vscale x 2 x i1>)
declare <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64>)		declare <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64>)

attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector),_ZGV_LLVM_Mxu_foo(foo_uniform)" }		attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector),_ZGV_LLVM_Mxu_foo(foo_uniform)" }
attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }		attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
attributes #2 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vector_nomask)" }		attributes #2 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vector_nomask)" }
attributes #3 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vector_nomask),_ZGV_LLVM_Mxv_foo(foo_vector)" }		attributes #3 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vector_nomask),_ZGV_LLVM_Mxv_foo(foo_vector)" }
attributes #4 = { "target-features"="+sve" vscale_range(2,16) "no-trapping-math"="false" }		attributes #4 = { "target-features"="+sve" vscale_range(2,16) "no-trapping-math"="false" }

llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; REQUIRES: asserts

				; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -debug-only=loop-vectorize -S < %s 2>&1 \| FileCheck %s

				target triple = "aarch64-unknown-linux-gnu"

				;; Given the choice between a masked and unmasked variant for the same VF (4)
				;; where no mask is required, make sure we choose the unmasked variant.
				david-arm (inline comment, Done): I think it's also worth having a test for the case where both VF=2 and VF=4 use unmasked variants, because even in this case we must create separate VPlans for each. I think the `Variant` member added to the recipe now requires it.

				; CHECK-LABEL: LV: Checking a loop in 'test_v4_v4m'
				; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
				; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
				; CHECK-EMPTY:
				; CHECK-NEXT: vector.ph:
				; CHECK-NEXT: Successor(s): vector loop
				; CHECK-EMPTY:
				; CHECK-NEXT: <x1> vector loop: {
				; CHECK-NEXT: vector.body:
				; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
				; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
				; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3>
				; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
				; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%load>)
				; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3>
				; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
				; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2>
				; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1>
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }
				; CHECK-NEXT: Successor(s): middle.block
				; CHECK-EMPTY:
				; CHECK-NEXT: middle.block:
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }

				; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
				; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
				; CHECK-EMPTY:
				; CHECK-NEXT: vector.ph:
				; CHECK-NEXT: Successor(s): vector loop
				; CHECK-EMPTY:
				; CHECK-NEXT: <x1> vector loop: {
				; CHECK-NEXT: vector.body:
				; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
				; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
				; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3>
				; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
				; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask)
				; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3>
				; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
				; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2>
				; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1>
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }
				; CHECK-NEXT: Successor(s): middle.block
				; CHECK-EMPTY:
				; CHECK-NEXT: middle.block:
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }

				;; If we have a masked variant at one VF and an unmasked variant at a different
				;; VF, ensure we create appropriate recipes (including a synthesized all-true
				;; mask for the masked variant)

				; CHECK-LABEL: LV: Checking a loop in 'test_v2_v4m'
				; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
				; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
				; CHECK-EMPTY:
				; CHECK-NEXT: vector.ph:
				; CHECK-NEXT: Successor(s): vector loop
				; CHECK-EMPTY:
				; CHECK-NEXT: <x1> vector loop: {
				; CHECK-NEXT: vector.body:
				; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
				; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
				; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3>
				; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
				; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask)
				; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3>
				; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
				; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2>
				; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1>
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }
				; CHECK-NEXT: Successor(s): middle.block
				; CHECK-EMPTY:
				; CHECK-NEXT: middle.block:
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }

				; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
				; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
				; CHECK-EMPTY:
				; CHECK-NEXT: vector.ph:
				; CHECK-NEXT: Successor(s): vector loop
				; CHECK-EMPTY:
				; CHECK-NEXT: <x1> vector loop: {
				; CHECK-NEXT: vector.body:
				; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
				; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
				; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3>
				; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
				; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>, ir<true>) (using library function: foo_vector_fixed4_mask)
				; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3>
				; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
				; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2>
				; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1>
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }
				; CHECK-NEXT: Successor(s): middle.block
				; CHECK-EMPTY:
				; CHECK-NEXT: middle.block:
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }

				;; If we have two variants at different VFs, neither of which are masked, we
				;; still expect to see a different vplan per VF.

				; CHECK-LABEL: LV: Checking a loop in 'test_v2_v4'
				; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
				; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
				; CHECK-EMPTY:
				; CHECK-NEXT: vector.ph:
				; CHECK-NEXT: Successor(s): vector loop
				; CHECK-EMPTY:
				; CHECK-NEXT: <x1> vector loop: {
				; CHECK-NEXT: vector.body:
				; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
				; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
				; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3>
				; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
				; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask)
				; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3>
				; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
				; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2>
				; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1>
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }
				; CHECK-NEXT: Successor(s): middle.block
				; CHECK-EMPTY:
				; CHECK-NEXT: middle.block:
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }

				; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
				; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
				; CHECK-EMPTY:
				; CHECK-NEXT: vector.ph:
				; CHECK-NEXT: Successor(s): vector loop
				; CHECK-EMPTY:
				; CHECK-NEXT: <x1> vector loop: {
				; CHECK-NEXT: vector.body:
				; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
				; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
				; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3>
				; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
				; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask)
				; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3>
				; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
				; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2>
				; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1>
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }
				; CHECK-NEXT: Successor(s): middle.block
				; CHECK-EMPTY:
				; CHECK-NEXT: middle.block:
				; CHECK-NEXT: No successors
				; CHECK-NEXT: }

				;; The call to @foo uses attribute group #0, which lists both an unmasked and a
				;; masked 4-lane variant. The checks below expect the vector body to call the
				;; unmasked variant, @foo_vector_fixed4_nomask.
				define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 {
				; CHECK-LABEL: @test_v4_v4m(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
				; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]]
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
				; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]])
				; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
				; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
				; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
				; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
				; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
				; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
				; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
				; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]]
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
				; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
				; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body

				for.body:
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
				%gep = getelementptr i64, ptr %b, i64 %indvars.iv
				%load = load i64, ptr %gep
				%call = call i64 @foo(i64 %load) #0
				%arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv
				store i64 %call, ptr %arrayidx
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 1024
				br i1 %exitcond, label %for.cond.cleanup, label %for.body

				for.cond.cleanup:
				ret void

				}

				;; The call to @foo uses attribute group #1, which lists an unmasked 2-lane
				;; variant and a masked 4-lane variant. The checks below expect a 4-wide vector
				;; body that calls @foo_vector_fixed4_mask with an all-true <4 x i1> mask.
				define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 {
				; CHECK-LABEL: @test_v2_v4m(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
				; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]]
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
				; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_mask(<4 x i64> [[WIDE_LOAD]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
				; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
				; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
				; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
				; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
				; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
				; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
				; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
				; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
				; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
				; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body

				for.body:
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
				%gep = getelementptr i64, ptr %b, i64 %indvars.iv
				%load = load i64, ptr %gep
				%call = call i64 @foo(i64 %load) #1
				%arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv
				store i64 %call, ptr %arrayidx
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 1024
				br i1 %exitcond, label %for.cond.cleanup, label %for.body

				for.cond.cleanup:
				ret void

				}

				;; The call to @foo uses attribute group #2, which lists unmasked variants at
				;; 2 and 4 lanes and no masked variant. The checks below expect a 4-wide vector
				;; body that calls @foo_vector_fixed4_nomask.
				define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 {
				; CHECK-LABEL: @test_v2_v4(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
				; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]]
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
				; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]])
				; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
				; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
				; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
				; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
				; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
				; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
				; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
				; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]]
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
				; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
				; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body

				for.body:
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
				%gep = getelementptr i64, ptr %b, i64 %indvars.iv
				%load = load i64, ptr %gep
				%call = call i64 @foo(i64 %load) #2
				%arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv
				store i64 %call, ptr %arrayidx
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 1024
				br i1 %exitcond, label %for.cond.cleanup, label %for.body

				for.cond.cleanup:
				ret void

				}

				;; Scalar function called from the loop body of each test in this file.
				declare i64 @foo(i64)

				;; fixed vector variants of foo
				;; The "_mask" variant takes an extra <4 x i1> operand; the "_nomask" variants
				;; take only the data vector.
				declare <2 x i64> @foo_vector_fixed2_nomask(<2 x i64>)
				declare <4 x i64> @foo_vector_fixed4_nomask(<4 x i64>)
				declare <4 x i64> @foo_vector_fixed4_mask(<4 x i64>, <4 x i1>)

				;; Call-site attribute groups mapping @foo to the vector variants above.
				;; NOTE(review): going by the variant names, 'N'/'M' in the mangled name mark
				;; an unmasked/masked variant and the digit is the VF; confirm against the
				;; vector function ABI documentation.
				;; #0: unmasked VF=4 and masked VF=4 (used by test_v4_v4m).
				attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N4v_foo(foo_vector_fixed4_nomask),_ZGV_LLVM_M4v_foo(foo_vector_fixed4_mask)" }
				;; #1: unmasked VF=2 and masked VF=4 (used by test_v2_v4m).
				attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_foo(foo_vector_fixed2_nomask),_ZGV_LLVM_M4v_foo(foo_vector_fixed4_mask)" }
				;; #2: unmasked VF=2 and unmasked VF=4 (used by test_v2_v4).
				attributes #2 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_foo(foo_vector_fixed2_nomask),_ZGV_LLVM_N4v_foo(foo_vector_fixed4_nomask)" }
				;; #3: function-level attributes attached to all three test functions.
				attributes #3 = { "target-features"="+sve" vscale_range(2,16) "no-trapping-math"="false" }

llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll

	Show All 13 Lines
	; CHECK-EMPTY:			; CHECK-EMPTY:
	; CHECK-NEXT: <x1> vector loop: {			; CHECK-NEXT: <x1> vector loop: {
	; CHECK-NEXT: vector.body:			; CHECK-NEXT: vector.body:
	; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION			; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
	; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>			; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
	; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3>			; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3>
	; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src>			; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src>
	; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l>			; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l>
	; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function)			; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64)
	; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3>			; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3>
	; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst>			; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst>
	; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2>			; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2>
	; CHECK-NEXT: EMIT branch-on-count vp<%10> vp<%1>			; CHECK-NEXT: EMIT branch-on-count vp<%10> vp<%1>
	; CHECK-NEXT: No successors			; CHECK-NEXT: No successors
	; CHECK-NEXT: }			; CHECK-NEXT: }
	; CHECK-NEXT: Successor(s): middle.block			; CHECK-NEXT: Successor(s): middle.block
	; CHECK-EMPTY:			; CHECK-EMPTY:
	▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Synthesize mask operands for vector variants as needed
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 497308

llvm/include/llvm/Analysis/VectorUtils.h

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

llvm/lib/Transforms/Vectorize/VPlan.h

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll

llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Synthesize mask operands for vector variants as needed
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 497308

llvm/include/llvm/Analysis/VectorUtils.h

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

llvm/lib/Transforms/Vectorize/VPlan.h

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll

llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll

[LoopVectorize] Synthesize mask operands for vector variants as needed
ClosedPublic