This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors
ClosedPublic

Authored by david-arm on Nov 4 2021, 4:01 AM.

Download Raw Diff

Details

Reviewers

sdesmalen
kmclaughlin
peterwaller-arm
fhahn

Commits

rGb0922a9dcd11: [LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors

Summary

The code in VPWidenCanonicalIVRecipe::execute only worked for fixed-width
vectors due to the way we generate the values per lane. This patch changes
the code to use a combination of vector splats and step vectors to get
the same result. This then works for both fixed-width and scalable vectors.

Tests that exercise this code path for scalable vectors have been added here:

Transforms/LoopVectorize/AArch64/sve-tail-folding.ll

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

david-arm created this revision. Nov 4 2021, 4:01 AM

Herald added subscribers: ctetreau, rogfer01, hiraditya, kristof.beyls. · View Herald Transcript · Nov 4 2021, 4:01 AM

david-arm requested review of this revision. Nov 4 2021, 4:01 AM

Herald added a project: Restricted Project. · View Herald Transcript · Nov 4 2021, 4:01 AM

Herald added subscribers: llvm-commits, vkmr. · View Herald Transcript

david-arm added a parent revision: D113003: [LoopVectorize] Add support for tail folding using scalable vectors. Nov 4 2021, 4:02 AM

Harbormaster completed remote builds in B132416: Diff 384698. Nov 4 2021, 4:02 AM

david-arm added a parent revision: D113122: [NFC][LoopVectorize] Make the createStepForVF interface more caller-friendly. Nov 4 2021, 4:02 AM

LGTM!

This revision is now accepted and ready to land. Nov 10 2021, 5:28 AM

Matt added a subscriber: Matt. Nov 17 2021, 12:03 PM

Rebase.

Harbormaster completed remote builds in B137837: Diff 392299. Dec 7 2021, 1:25 AM

This revision was landed with ongoing or failed builds. Jan 10 2022, 6:12 AM

Closed by commit rGb0922a9dcd11: [LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors (authored by david-arm). · Explain Why

This revision was automatically updated to reflect the committed changes.

david-arm added a commit: rGb0922a9dcd11: [LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors.

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

VPlan.cpp

18 lines

test/

Transforms/

LoopVectorize/

AArch64/

sve-tail-folding.ll

133 lines

Diff 398607

llvm/lib/Transforms/Vectorize/VPlan.cpp

	Show First 20 Lines • Show All 1,350 Lines • ▼ Show 20 Lines
	}			}
	#endif			#endif

	void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {			void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
	Value *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);			Value *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
	Type *STy = CanonicalIV->getType();			Type *STy = CanonicalIV->getType();
	IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());			IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
	ElementCount VF = State.VF;			ElementCount VF = State.VF;
	assert(!VF.isScalable() && "the code following assumes non scalables ECs");
	Value *VStart = VF.isScalar()			Value *VStart = VF.isScalar()
	? CanonicalIV			? CanonicalIV
	: Builder.CreateVectorSplat(VF.getKnownMinValue(),			: Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
	CanonicalIV, "broadcast");
	for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {			for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
	SmallVector<Constant *, 8> Indices;			Value *VStep = createStepForVF(Builder, STy, VF, Part);
	for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)			if (VF.isVector()) {
	Indices.push_back(			VStep = Builder.CreateVectorSplat(VF, VStep);
	ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));			VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
	// If VF == 1, there is only one iteration in the loop above, thus the			}
	// element pushed back into Indices is ConstantInt::get(STy, Part)
	Constant *VStep =
	VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
	// Add the consecutive indices to the vector value.
	Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");			Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
	State.set(this, CanonicalVectorIV, Part);			State.set(this, CanonicalVectorIV, Part);
	}			}
	}			}

	#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)			#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
	void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,			void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
	VPSlotTracker &SlotTracker) const {			VPSlotTracker &SlotTracker) const {
	▲ Show 20 Lines • Show All 228 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll

Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines	while.body: ; preds = %while.body, %entry
%cmp10 = icmp ult i64 %index.next, %n		%cmp10 = icmp ult i64 %index.next, %n
br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0		br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit: ; preds = %while.body		while.end.loopexit: ; preds = %while.body
ret void		ret void
}		}


		define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
		; CHECK-LABEL: @copy_stride4(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 4)
		; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
		; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
		; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
		; CHECK: vector.ph:
		; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
		; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
		; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
		; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
		; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
		; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
		; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
		; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
		; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> [[TMP8]], zeroinitializer
		; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
		; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
		; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
		; CHECK-NEXT: [[TMP13:%.*]] = mul i64 4, [[TMP12]]
		; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i32 0
		; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: br label %vector.body
		; CHECK: vector.body:
		; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
		; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
		; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
		; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP15]]
		; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
		; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
		; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> undef)
		; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
		; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP16]])
		; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
		; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
		; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
		; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
		; CHECK-NEXT: br i1 [[TMP21]], label %middle.block, label %vector.body
		; CHECK: middle.block:
		; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
		;
		entry:
		br label %while.body

		while.body: ; preds = %while.body, %entry
		%index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
		%gep1 = getelementptr i32, i32* %src, i64 %index
		%val = load i32, i32* %gep1
		%gep2 = getelementptr i32, i32* %dst, i64 %index
		store i32 %val, i32* %gep2
		%index.next = add nsw i64 %index, 4
		%cmp10 = icmp ult i64 %index.next, %n
		br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

		while.end.loopexit: ; preds = %while.body
		ret void
		}


define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {		define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {
; CHECK-LABEL: @simple_gather_scatter(		; CHECK-LABEL: @simple_gather_scatter(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)		; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:		; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()		; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4		; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	if.end: ; preds = %if.then, %for.body
%exitcond.not = icmp eq i64 %index.next, %n		%exitcond.not = icmp eq i64 %index.next, %n
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0		br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end: ; preds = %for.inc, %entry		for.end: ; preds = %for.inc, %entry
ret void		ret void
}		}


		; The original loop had an unconditional uniform store. Let's make sure
		; we don't artificially create new predicated blocks for the load.
		define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
		; CHECK-LABEL: @uniform_store(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
		; CHECK: vector.ph:
		; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
		; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
		; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
		; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
		; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
		; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
		; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: br label %vector.body
		; CHECK: vector.body:
		; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
		; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
		; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
		; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP7]]
		; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
		; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
		; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP8]]
		; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
		; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
		; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
		; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT4]], i32 4, <vscale x 4 x i1> [[TMP9]])
		; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
		; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
		; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
		; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
		; CHECK: middle.block:
		; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
		;

		entry:
		br label %for.body

		for.body: ; preds = %entry, %for.body
		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
		%arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
		%val = load i32, i32* %arrayidx, align 4
		store i32 %val, i32* %dst, align 4
		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
		%exitcond.not = icmp eq i64 %indvars.iv.next, %n
		br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

		for.end: ; preds = %for.body, %entry
		ret void
		}


define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {		define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_fdiv(		; CHECK-LABEL: @simple_fdiv(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)		; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:		; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()		; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4		; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
▲ Show 20 Lines • Show All 214 Lines • Show Last 20 Lines