Diff 443936

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,671 Lines • ▼ Show 20 Lines	private:

/// A type representing the costs for instructions if they were to be		/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost		/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.		/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;		using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

/// A set containing all BasicBlocks that are known to be present after		/// A set containing all BasicBlocks that are known to be present after
/// vectorization as a predicated block.		/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;		DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
		PredicatedBBsAfterVectorization;

/// Records whether it is allowed to have the original scalar loop execute at		/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime		/// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder		/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,		/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.		/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any		/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.		/// iterations to execute in the scalar loop.
▲ Show 20 Lines • Show All 4,394 Lines • ▼ Show 20 Lines	if (VF.isScalar() \|\| VF.isZero() \|\|
InstsToScalarize.find(VF) != InstsToScalarize.end())		InstsToScalarize.find(VF) != InstsToScalarize.end())
return;		return;

// Initialize a mapping for VF in InstsToScalarize. If we find that it's		// Initialize a mapping for VF in InstsToScalarize. If we find that it's
// not profitable to scalarize any instructions, the presence of VF in the		// not profitable to scalarize any instructions, the presence of VF in the
// map will indicate that we've analyzed it already.		// map will indicate that we've analyzed it already.
ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];		ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

		PredicatedBBsAfterVectorization[VF].clear();

// Find all the instructions that are scalar with predication in the loop and		// Find all the instructions that are scalar with predication in the loop and
// determine if it would be better to not if-convert the blocks they are in.		// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.		// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {		for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockNeedsPredicationForAnyReason(BB))		if (!blockNeedsPredicationForAnyReason(BB))
continue;		continue;
for (Instruction &I : *BB)		for (Instruction &I : *BB)
if (isScalarWithPredication(&I, VF)) {		if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;		ScalarCostsTy ScalarCosts;
// Do not apply discount if scalable, because that would lead to		// Do not apply discount if scalable, because that would lead to
// invalid scalarization costs.		// invalid scalarization costs.
// Do not apply discount logic if hacked cost is needed		// Do not apply discount logic if hacked cost is needed
// for emulated masked memrefs.		// for emulated masked memrefs.
if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&		if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)		computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());		ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.		// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization.insert(BB);		PredicatedBBsAfterVectorization[VF].insert(BB);
}		}
}		}
}		}

int LoopVectorizationCostModel::computePredInstDiscount(		int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {		Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&		assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");		"Instruction marked uniform-after-vectorization will be predicated");
▲ Show 20 Lines • Show All 848 Lines • ▼ Show 20 Lines	case Instruction::GetElementPtr:
return 0;		return 0;
case Instruction::Br: {		case Instruction::Br: {
// In cases of scalarized and predicated instructions, there will be VF		// In cases of scalarized and predicated instructions, there will be VF
// predicated blocks in the vectorized loop. Each branch around these		// predicated blocks in the vectorized loop. Each branch around these
// blocks requires also an extract of its vector compare i1 element.		// blocks requires also an extract of its vector compare i1 element.
bool ScalarPredicatedBB = false;		bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);		BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&		if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) \|\|		(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) \|\|
PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))		PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;		ScalarPredicatedBB = true;

if (ScalarPredicatedBB) {		if (ScalarPredicatedBB) {
// Not possible to scalarize scalable vector with predicated instructions.		// Not possible to scalarize scalable vector with predicated instructions.
if (VF.isScalable())		if (VF.isScalable())
return InstructionCost::getInvalid();		return InstructionCost::getInvalid();
// Return cost for branches around scalarized and predicated blocks.		// Return cost for branches around scalarized and predicated blocks.
auto *Vec_i1Ty =		auto *Vec_i1Ty =
▲ Show 20 Lines • Show All 3,647 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll

This file was added.

				; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize <%s \| FileCheck %s

				target triple = "aarch64-unknown-linux-gnu"

				; The uniform load of %d in the following loop triggers the special
				; branch costing code in LoopVectorizationCostModel::getInstructionCost.
				; However, this should only affect the fixed-width cost because for
				[Inline comment — sdesmalen (Unsubmitted, Done)] nit: Can this test be reduced a little bit more? I'm not sure if `%boff`, `%coff` and all of the pointers are really needed for this test.
				; NEON it needs to scalarize the load, whereas for SVE it can use a predicated load.
				; Because of how the LoopVectorizer annotates the load to need scalarization with
				[Inline comment — sdesmalen (Unsubmitted, Done)] nit: can you clean up this test a bit? `dso_local`, `nocapture`, etc. aren't really required for the LV test. Also, can you use some constant FP value instead of loading from `@globval`?
				[Inline comment — david-arm, author (Unsubmitted, Done)] I can try using other types of uniform load, so long as the load is treated as scalar with predication for a fixed-width VF.
				; predicated blocks, this leads to different costs for the branch instruction.
				;
				; NOTE: This test assumes we will never use a fixed-width VF due to
				; the high cost of scalarizing the masked store, however this assumption may
				; break in future if we permit the use of SVE loads and stores to perform the
				; fixed-width operations.
				define i32 @uniform_load(i64 %n, ptr readnone %c, ptr %d) #0 {
				[Inline comment — sdesmalen (Unsubmitted, Done)] If it selects VF=vscale x 4 then it seems redundant to check there is no invalid cost for that chosen VF. If you want to check that it uses a scalable VF, you don't need the debug-output to tell you it chose that. You could remove the `REQUIRES: asserts`, the `-debug` option from the RUN line and instead check for `<vscale x 4 x i32>` in the resulting IR. What do you think? Just a note that this test may become redundant in the future if SVE is used for fixed-width vectors and it can lower fixed-width masked load/store operations to use SVE. Maybe you can add a note here that this assumption is important for the test and may change in the future?
				; CHECK-LABEL: @uniform_load(
				[Inline comment — sdesmalen (Unsubmitted, Done)] What in this loop is causing `PredicatedBBsAfterVectorization` to be `true` for fixed-width vectors?
				[Inline comment — david-arm, author (Unsubmitted, Done)] It's the uniform load of `@globval`, which causes LoopVectorizationCostModel::collectInstsToScalarize to insert the BB into PredicatedBBsAfterVectorization.
				[Inline comment — sdesmalen (Unsubmitted, Done)] Okay thanks, can you just add a comment to the test describing this? I didn't realise that the load from @globval had this effect.
				; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>
				entry:
				br label %for.body

				for.body: ; preds = %entry, %for.body
				%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
				%load2 = load float, ptr %d, align 4
				%arrayidx2 = getelementptr inbounds float, ptr %c, i64 %indvars.iv
				store float %load2, ptr %arrayidx2, align 4
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond.not = icmp eq i64 %indvars.iv.next, %n
				br i1 %exitcond.not, label %for.end, label %for.body

				for.end: ; preds = %for.body
				ret i32 0
				}

				attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Change PredicatedBBsAfterVectorization to be per VF
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 443936

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Change PredicatedBBsAfterVectorization to be per VF
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 443936

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll

[LoopVectorize] Change PredicatedBBsAfterVectorization to be per VF
ClosedPublic