This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors
ClosedPublic

Authored by david-arm on Nov 4 2021, 4:01 AM.

Download Raw Diff

Details

Reviewers

sdesmalen
kmclaughlin
peterwaller-arm
fhahn

Commits

rGb0922a9dcd11: [LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors

Summary

The code in VPWidenCanonicalIVRecipe::execute only worked for fixed-width
vectors due to the way we generate the values per lane. This patch changes
the code to use a combination of vector splats and step vectors to get
the same result. This then works for both fixed-width and scalable vectors.

Tests that exercise this code path for scalable vectors have been added here:

Transforms/LoopVectorize/AArch64/sve-tail-folding.ll

Diff Detail

Unit Tests: Failed

	Time	Test
	1,320 ms	x64 debian > libomp.tasking::omp_task_red_taskloop.c

Event Timeline

david-arm created this revision. Nov 4 2021, 4:01 AM

Herald added subscribers: ctetreau, rogfer01, hiraditya, kristof.beyls. · View Herald Transcript Nov 4 2021, 4:01 AM

david-arm requested review of this revision. Nov 4 2021, 4:01 AM

Herald added a project: Restricted Project. · View Herald Transcript Nov 4 2021, 4:01 AM

Herald added subscribers: llvm-commits, vkmr. · View Herald Transcript

david-arm added a parent revision: D113003: [LoopVectorize] Add support for tail folding using scalable vectors. Nov 4 2021, 4:02 AM

Harbormaster completed remote builds in B132416: Diff 384698. Nov 4 2021, 4:02 AM

david-arm added a parent revision: D113122: [NFC][LoopVectorize] Make the createStepForVF interface more caller-friendly. Nov 4 2021, 4:02 AM

LGTM!

This revision is now accepted and ready to land. Nov 10 2021, 5:28 AM

Matt added a subscriber: Matt. Nov 17 2021, 12:03 PM

Rebase.

Harbormaster completed remote builds in B137837: Diff 392299. Dec 7 2021, 1:25 AM

This revision was landed with ongoing or failed builds. Jan 10 2022, 6:12 AM

Closed by commit rGb0922a9dcd11: [LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors (authored by david-arm). · Explain Why

This revision was automatically updated to reflect the committed changes.

david-arm added a commit: rGb0922a9dcd11: [LoopVectorize] Make VPWidenCanonicalIVRecipe::execute work for scalable vectors.

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

LoopVectorize.cpp

16 lines

VPlan.h

4 lines

VPlan.cpp

18 lines

test/

Transforms/

LoopVectorize/

AArch64/

sve-tail-folding.ll

133 lines

Diff 392299

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,097 Lines • ▼ Show 20 Lines	if (I) {
// using the loop's.		// using the loop's.
if (I->getDebugLoc())		if (I->getDebugLoc())
DL = I->getDebugLoc();		DL = I->getDebugLoc();
}		}

return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);		return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}		}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {		namespace llvm {

/// Return the runtime value for VF.		/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {		Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());		Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(EC) : EC;		return VF.isScalable() ? B.CreateVScale(EC) : EC;
}		}

		/// Return a value for Step multiplied by VF.
		Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
		int64_t Step) {
		assert(Ty->isIntegerTy() && "Expected an integer step");
		Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
		return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
		}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {		static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
assert(FTy->isFloatingPointTy() && "Expected floating point type!");		assert(FTy->isFloatingPointTy() && "Expected floating point type!");
Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());		Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);		Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
return B.CreateUIToFP(RuntimeVF, FTy);		return B.CreateUIToFP(RuntimeVF, FTy);
}		}

void reportVectorizationFailure(const StringRef DebugMsg,		void reportVectorizationFailure(const StringRef DebugMsg,
▲ Show 20 Lines • Show All 9,659 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlan.h

	Show First 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
	class VPReplicateRecipe;			class VPReplicateRecipe;
	class VPlanSlp;			class VPlanSlp;

	/// Returns a calculation for the total number of elements for a given \p VF.			/// Returns a calculation for the total number of elements for a given \p VF.
	/// For fixed width vectors this value is a constant, whereas for scalable			/// For fixed width vectors this value is a constant, whereas for scalable
	/// vectors it is an expression determined at runtime.			/// vectors it is an expression determined at runtime.
	Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);			Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);

				/// Similar to getRuntimeVF above, except it returns the total number of
				/// elements for a given \p VF, multiplied by Step.
				Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step);

	/// A range of powers-of-2 vectorization factors with fixed start and			/// A range of powers-of-2 vectorization factors with fixed start and
	/// adjustable end. The range includes start and excludes end, e.g.,:			/// adjustable end. The range includes start and excludes end, e.g.,:
	/// [1, 9) = {1, 2, 4, 8}			/// [1, 9) = {1, 2, 4, 8}
	struct VFRange {			struct VFRange {
	// A power of 2.			// A power of 2.
	const ElementCount Start;			const ElementCount Start;

	// Need not be a power of 2. If End <= Start range is empty.			// Need not be a power of 2. If End <= Start range is empty.
	▲ Show 20 Lines • Show All 2,511 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlan.cpp

	Show First 20 Lines • Show All 1,258 Lines • ▼ Show 20 Lines
	}			}
	#endif			#endif

	void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {			void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
	Value *CanonicalIV = State.CanonicalIV;			Value *CanonicalIV = State.CanonicalIV;
	Type *STy = CanonicalIV->getType();			Type *STy = CanonicalIV->getType();
	IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());			IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
	ElementCount VF = State.VF;			ElementCount VF = State.VF;
	assert(!VF.isScalable() && "the code following assumes non scalables ECs");
	Value *VStart = VF.isScalar()			Value *VStart = VF.isScalar()
	? CanonicalIV			? CanonicalIV
	: Builder.CreateVectorSplat(VF.getKnownMinValue(),			: Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
	CanonicalIV, "broadcast");
	for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {			for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
	SmallVector<Constant *, 8> Indices;			Value *VStep = createStepForVF(Builder, STy, VF, Part);
	for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)			if (VF.isVector()) {
	Indices.push_back(			VStep = Builder.CreateVectorSplat(VF, VStep);
	ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));			VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
				Lint: Pre-merge checks (clang-format): please reformat the code — replace the single line `VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));` with `VStep =` followed on the next line by `Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));`.
	// If VF == 1, there is only one iteration in the loop above, thus the			}
	// element pushed back into Indices is ConstantInt::get(STy, Part)
	Constant *VStep =
	VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
	// Add the consecutive indices to the vector value.
	Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");			Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
	State.set(this, CanonicalVectorIV, Part);			State.set(this, CanonicalVectorIV, Part);
	}			}
	}			}

	#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)			#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
	void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,			void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
	VPSlotTracker &SlotTracker) const {			VPSlotTracker &SlotTracker) const {
	▲ Show 20 Lines • Show All 227 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll

Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	while.body: ; preds = %while.body, %entry
%cmp10 = icmp ult i64 %index.next, %n		%cmp10 = icmp ult i64 %index.next, %n
br i1 %cmp10, label %while.body, label %while.end.loopexit		br i1 %cmp10, label %while.body, label %while.end.loopexit

while.end.loopexit: ; preds = %while.body		while.end.loopexit: ; preds = %while.body
ret void		ret void
}		}


		define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
		; CHECK-LABEL: @copy_stride4(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 4)
		; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
		; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
		; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
		; CHECK: vector.ph:
		; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
		; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
		; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
		; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
		; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
		; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
		; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
		; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
		; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> [[TMP8]], zeroinitializer
		; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
		; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
		; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
		; CHECK-NEXT: [[TMP13:%.*]] = mul i64 4, [[TMP12]]
		; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i32 0
		; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: br label %vector.body
		; CHECK: vector.body:
		; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
		; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
		; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
		; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP15]]
		; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
		; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
		; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> undef)
		; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
		; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP16]])
		; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
		; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
		; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
		; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
		; CHECK-NEXT: br i1 [[TMP21]], label %middle.block, label %vector.body
		; CHECK: middle.block:
		; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
		;
		entry:
		br label %while.body

		while.body: ; preds = %while.body, %entry
		%index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
		%gep1 = getelementptr i32, i32* %src, i64 %index
		%val = load i32, i32* %gep1
		%gep2 = getelementptr i32, i32* %dst, i64 %index
		store i32 %val, i32* %gep2
		%index.next = add nsw i64 %index, 4
		%cmp10 = icmp ult i64 %index.next, %n
		br i1 %cmp10, label %while.body, label %while.end.loopexit

		while.end.loopexit: ; preds = %while.body
		ret void
		}


define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {		define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {
; CHECK-LABEL: @simple_gather_scatter(		; CHECK-LABEL: @simple_gather_scatter(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)		; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:		; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()		; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4		; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	if.end: ; preds = %if.then, %for.body
%exitcond.not = icmp eq i64 %index.next, %n		%exitcond.not = icmp eq i64 %index.next, %n
br i1 %exitcond.not, label %for.end, label %for.body		br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.inc, %entry		for.end: ; preds = %for.inc, %entry
ret void		ret void
}		}


		; The original loop had an unconditional uniform store. Let's make sure
		; we don't artificially create new predicated blocks for the load.
		define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
		; CHECK-LABEL: @uniform_store(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
		; CHECK: vector.ph:
		; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
		; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
		; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
		; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
		; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
		; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
		; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: br label %vector.body
		; CHECK: vector.body:
		; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
		; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i32 0
		; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
		; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
		; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
		; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
		; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP7]]
		; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
		; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
		; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP8]]
		; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
		; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
		; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
		; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT4]], i32 4, <vscale x 4 x i1> [[TMP9]])
		; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
		; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
		; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
		; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
		; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
		; CHECK: middle.block:
		; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
		;

		entry:
		br label %for.body

		for.body: ; preds = %entry, %for.body
		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
		%arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
		%val = load i32, i32* %arrayidx, align 4
		store i32 %val, i32* %dst, align 4
		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
		%exitcond.not = icmp eq i64 %indvars.iv.next, %n
		br i1 %exitcond.not, label %for.end, label %for.body

		for.end: ; preds = %for.body, %entry
		ret void
		}


define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {		define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_fdiv(		; CHECK-LABEL: @simple_fdiv(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)		; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph		; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:		; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()		; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4		; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
▲ Show 20 Lines • Show All 211 Lines • Show Last 20 Lines