Diff 503775

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Show First 20 Lines • Show All 117 Lines • ▼ Show 20 Lines	InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,		ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,		TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,		VectorType *SubTp,
ArrayRef<const Value *> Args = std::nullopt);		ArrayRef<const Value *> Args = std::nullopt);

InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,		InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);		TTI::TargetCostKind CostKind);

		/// Cost of an interleaved load/store group that is accessed through the wide
		/// vector type \p VecTy with interleave factor \p Factor. \p Indices lists
		/// the group members to cost; both mask flags default to false.
		InstructionCost getInterleavedMemoryOpCost(
		unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
		Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
		bool UseMaskForCond = false, bool UseMaskForGaps = false);

InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,		InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,		const Value *Ptr, bool VariableMask,
Align Alignment,		Align Alignment,
TTI::TargetCostKind CostKind,		TTI::TargetCostKind CostKind,
const Instruction *I);		const Instruction *I);

InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,		InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH,		TTI::CastContextHint CCH,
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines	unsigned getMaxInterleaveFactor(ElementCount VF) {
// Don't interleave if the loop has been vectorized with scalable vectors.		// Don't interleave if the loop has been vectorized with scalable vectors.
if (VF.isScalable())		if (VF.isScalable())
return 1;		return 1;
// If the loop will not be vectorized, don't interleave the loop.		// If the loop will not be vectorized, don't interleave the loop.
// Let regular unroll to unroll the loop.		// Let regular unroll to unroll the loop.
return VF.isScalar() ? 1 : ST->getMaxInterleaveFactor();		return VF.isScalar() ? 1 : ST->getMaxInterleaveFactor();
}		}

		/// Interleaved-access vectorization is always enabled: this query
		/// unconditionally answers true.
		bool enableInterleavedAccessVectorization() {
		  return true;
		}

enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };		enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
unsigned getNumberOfRegisters(unsigned ClassID) const {		unsigned getNumberOfRegisters(unsigned ClassID) const {
switch (ClassID) {		switch (ClassID) {
case RISCVRegisterClass::GPRRC:		case RISCVRegisterClass::GPRRC:
// 31 = 32 GPR - x0 (zero register)		// 31 = 32 GPR - x0 (zero register)
// FIXME: Should we exclude fixed registers like SP, TP or GP?		// FIXME: Should we exclude fixed registers like SP, TP or GP?
return 31;		return 31;
case RISCVRegisterClass::FPRRC:		case RISCVRegisterClass::FPRRC:
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Show First 20 Lines • Show All 360 Lines • ▼ Show 20 Lines	RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
if (!isLegalMaskedLoadStore(Src, Alignment) ||		if (!isLegalMaskedLoadStore(Src, Alignment) ||
CostKind != TTI::TCK_RecipThroughput)		CostKind != TTI::TCK_RecipThroughput)
return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,		return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);		CostKind);

return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);		return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}		}

		/// Estimate the cost of an interleaved memory access group.
		///
		/// The group is modelled as one wide load/store of \p VecTy plus the shuffles
		/// needed to de-interleave (loads) or interleave (stores) its members. The
		/// full cost of the wide memory op is always the baseline; per review
		/// feedback this is the conservative choice, and it can be reduced later for
		/// targets that have a demonstrably faster segment load/store.
		///
		/// \param Opcode   Instruction::Load or Instruction::Store.
		/// \param VecTy    The wide vector type of the whole group. Anything that is
		///                 not a fixed-length vector yields an invalid cost.
		/// \param Factor   Interleave factor of the group.
		/// \param Indices  For loads, one de-interleaving shuffle is costed per entry.
		/// \param UseMaskForCond, UseMaskForGaps  Only forwarded to the generic
		///                 implementation; the RISC-V specific paths ignore them.
		/// \returns The wide memory op cost plus shuffle costs; the generic BaseT cost
		///          for stores whose Factor is not 2; or an invalid cost when \p VecTy
		///          is not a fixed-length vector.
		InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
		    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
		    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
		    bool UseMaskForCond, bool UseMaskForGaps) {
		  // cast<> asserts on anything that is not a fixed-length vector (e.g. a
		  // scalable vector), so report such groups as not costable instead.
		  auto *FVTy = dyn_cast<FixedVectorType>(VecTy);
		  if (!FVTy)
		    return InstructionCost::getInvalid();

		  // Baseline: the cost of the actual wide memory operation.
		  InstructionCost MemCost =
		      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
		  unsigned VF = FVTy->getNumElements() / Factor;

		  // An interleaved load will look like this for Factor=3:
		  // %wide.vec = load <12 x i32>, ptr %3, align 4
		  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
		  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
		  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
		  if (Opcode == Instruction::Load) {
		    // The per-member result type does not depend on the index, so build it
		    // once instead of on every iteration.
		    FixedVectorType *SubVecTy =
		        FixedVectorType::get(FVTy->getElementType(), VF);
		    InstructionCost Cost = MemCost;
		    for (unsigned Index : Indices) {
		      auto Mask = createStrideMask(Index, Factor, VF);
		      Cost += getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy,
		                             Mask, CostKind, 0, nullptr, {});
		    }
		    return Cost;
		  }

		  // TODO: Model for NF > 2
		  // We'll need to enhance getShuffleCost to model shuffles that are inserts and
		  // extracts.
		  // An interleaved store will look like
		  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
		  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
		  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
		  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
		  // store <12 x i32> %interleaved.vec, ptr %10, align 4
		  if (Factor != 2)
		    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
		                                             Alignment, AddressSpace, CostKind,
		                                             UseMaskForCond, UseMaskForGaps);

		  // Loads returned above, so only stores can reach this point.
		  assert(Opcode == Instruction::Store && "Opcode must be a store");
		  // For an interleaving store of 2 vectors, we perform one large interleaving
		  // shuffle that goes into the wide store
		  auto Mask = createInterleaveMask(VF, Factor);
		  InstructionCost ShuffleCost =
		      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
		                     CostKind, 0, nullptr, {});
		  return MemCost + ShuffleCost;
		}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(		InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,		unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {		Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)		if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,		return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment, CostKind, I);		Alignment, CostKind, I);

if ((Opcode == Instruction::Load &&		if ((Opcode == Instruction::Load &&
▲ Show 20 Lines • Show All 1,146 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+zve32x,+zvl1024b -S | FileCheck %s

				; This element type isn't a supported SEW so this shouldn't be interleaved
				define void @load_store_zve32x(ptr %p) {
				; CHECK-LABEL: @load_store_zve32x(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
				; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
				; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
				; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
				; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = shl i64 %i, 1
				%q0 = getelementptr i64, ptr %p, i64 %offset0
				%x0 = load i64, ptr %q0
				%y0 = add i64 %x0, 1
				store i64 %y0, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i64, ptr %p, i64 %offset1
				%x1 = load i64, ptr %q1
				%y1 = add i64 %x1, 2
				store i64 %y1, ptr %q1

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S | FileCheck %s

				define void @load_store_factor2_i32(ptr %p) {
				; CHECK-LABEL: @load_store_factor2_i32(
				; CHECK-NEXT: entry:
				; NOTE(review, reames): Can you add a couple tests for SEW < 64 bits? Also, you should probably add an actual CostModel test rather than relying on indirectly testing this through the vectorizer.
				; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
				; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
				; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
				; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
				; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
				; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
				; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2>
				; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
				; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
				; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
				; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
				; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
				; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
				; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
				; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
				; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
				; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = shl i64 %i, 1
				%q0 = getelementptr i32, ptr %p, i64 %offset0
				%x0 = load i32, ptr %q0
				%y0 = add i32 %x0, 1
				store i32 %y0, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i32, ptr %p, i64 %offset1
				%x1 = load i32, ptr %q1
				%y1 = add i32 %x1, 2
				store i32 %y1, ptr %q1

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

				define void @load_store_factor2_i64(ptr %p) {
				; CHECK-LABEL: @load_store_factor2_i64(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
				; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
				; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
				; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
				; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = shl i64 %i, 1
				%q0 = getelementptr i64, ptr %p, i64 %offset0
				%x0 = load i64, ptr %q0
				%y0 = add i64 %x0, 1
				store i64 %y0, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i64, ptr %p, i64 %offset1
				%x1 = load i64, ptr %q1
				%y1 = add i64 %x1, 2
				store i64 %y1, ptr %q1

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

				define void @load_store_factor3_i32(ptr %p) {
				; CHECK-LABEL: @load_store_factor3_i32(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
				; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
				; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
				; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
				; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
				; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
				; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
				; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
				; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
				; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
				; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
				; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP10]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
				; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[TMP12]], <vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
				; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], <vscale x 2 x i64> [[TMP13]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
				; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i32> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 2, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[TMP15]], <vscale x 2 x ptr> [[TMP14]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
				; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP13]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], <vscale x 2 x i64> [[TMP16]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP17]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
				; CHECK-NEXT: [[TMP18:%.*]] = add <vscale x 2 x i32> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 3, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
				; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[TMP18]], <vscale x 2 x ptr> [[TMP17]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
				; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
				; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
				; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
				; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
				; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
				; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
				; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
				; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
				; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
				; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
				; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
				; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4
				; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3
				; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = mul i64 %i, 3
				%q0 = getelementptr i32, ptr %p, i64 %offset0
				%x0 = load i32, ptr %q0
				%y0 = add i32 %x0, 1
				store i32 %y0, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i32, ptr %p, i64 %offset1
				%x1 = load i32, ptr %q1
				%y1 = add i32 %x1, 2
				store i32 %y1, ptr %q1

				%offset2 = add i64 %offset1, 1
				%q2 = getelementptr i32, ptr %p, i64 %offset2
				%x2 = load i32, ptr %q2
				%y2 = add i32 %x2, 3
				store i32 %y2, ptr %q2

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

				define void @load_store_factor3_i64(ptr %p) {
				; CHECK-LABEL: @load_store_factor3_i64(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
				; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
				; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
				; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
				; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4
				; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
				; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
				; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 4
				; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
				; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = mul i64 %i, 3
				%q0 = getelementptr i64, ptr %p, i64 %offset0
				%x0 = load i64, ptr %q0
				%y0 = add i64 %x0, 1
				store i64 %y0, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i64, ptr %p, i64 %offset1
				%x1 = load i64, ptr %q1
				%y1 = add i64 %x1, 2
				store i64 %y1, ptr %q1

				%offset2 = add i64 %offset1, 1
				%q2 = getelementptr i64, ptr %p, i64 %offset2
				%x2 = load i64, ptr %q2
				%y2 = add i64 %x2, 3
				store i64 %y2, ptr %q2

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

				define void @load_store_factor8(ptr %p) {
				; CHECK-LABEL: @load_store_factor8(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
				; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
				; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
				; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
				; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 4
				; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
				; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
				; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 4
				; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
				; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 4
				; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
				; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
				; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 4
				; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
				; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 4
				; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
				; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
				; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 4
				; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
				; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 4
				; CHECK-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
				; CHECK-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
				; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 4
				; CHECK-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
				; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 4
				; CHECK-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
				; CHECK-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
				; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 4
				; CHECK-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7
				; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 4
				; CHECK-NEXT: [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
				; CHECK-NEXT: [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
				; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 4
				; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8
				; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = shl i64 %i, 3
				%q0 = getelementptr i64, ptr %p, i64 %offset0
				%x0 = load i64, ptr %q0
				%y0 = add i64 %x0, 1
				store i64 %y0, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i64, ptr %p, i64 %offset1
				%x1 = load i64, ptr %q1
				%y1 = add i64 %x1, 2
				store i64 %y1, ptr %q1

				%offset2 = add i64 %offset1, 1
				%q2 = getelementptr i64, ptr %p, i64 %offset2
				%x2 = load i64, ptr %q2
				%y2 = add i64 %x2, 3
				store i64 %y2, ptr %q2

				%offset3 = add i64 %offset2, 1
				%q3 = getelementptr i64, ptr %p, i64 %offset3
				%x3 = load i64, ptr %q3
				%y3 = add i64 %x3, 4
				store i64 %y3, ptr %q3

				%offset4 = add i64 %offset3, 1
				%q4 = getelementptr i64, ptr %p, i64 %offset4
				%x4 = load i64, ptr %q4
				%y4 = add i64 %x4, 5
				store i64 %y4, ptr %q4

				%offset5 = add i64 %offset4, 1
				%q5 = getelementptr i64, ptr %p, i64 %offset5
				%x5 = load i64, ptr %q5
				%y5 = add i64 %x5, 6
				store i64 %y5, ptr %q5

				%offset6 = add i64 %offset5, 1
				%q6 = getelementptr i64, ptr %p, i64 %offset6
				%x6 = load i64, ptr %q6
				%y6 = add i64 %x6, 7
				store i64 %y6, ptr %q6

				%offset7 = add i64 %offset6, 1
				%q7 = getelementptr i64, ptr %p, i64 %offset7
				%x7 = load i64, ptr %q7
				%y7 = add i64 %x7, 8
				store i64 %y7, ptr %q7

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

				define void @combine_load_factor2_i32(ptr %p) {
				; CHECK-LABEL: @combine_load_factor2_i32(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
				; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
				; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[STEP_ADD]], <i64 1, i64 1, i64 1, i64 1>
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], <4 x i64> [[TMP0]]
				; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], <4 x i64> [[TMP1]]
				; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP2]], i32 0
				; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP3]], i32 0
				; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
				; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
				; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
				; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
				; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC5]]
				; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP8]], <4 x ptr> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
				; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
				; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
				; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
				; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
				; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
				; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]]
				; CHECK-NEXT: store i32 [[RES]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				%offset0 = shl i64 %i, 1
				%q0 = getelementptr i32, ptr %p, i64 %offset0
				%x0 = load i32, ptr %q0

				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i32, ptr %p, i64 %offset1
				%x1 = load i32, ptr %q1

				%res = add i32 %x0, %x1

				store i32 %res, ptr %q0

				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

				; Same access pattern as the i32 variant, but on i64 elements: loads from
				; p[2*i] and p[2*i+1], adds them, and stores the sum to p[2*i].
				; The assertions below expect the loop to stay scalar (no vector body is
				; matched for this function).
				define void @combine_load_factor2_i64(ptr %p) {
				; CHECK-LABEL: @combine_load_factor2_i64(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
				; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
				; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
				; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 4
				; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
				; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
				; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 4
				; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]]
				; CHECK-NEXT: store i64 [[RES]], ptr [[Q0]], align 4
				; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
				; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
				; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop
				loop:
				%i = phi i64 [0, %entry], [%nexti, %loop]

				; Even element of the pair: index 2*i.
				%offset0 = shl i64 %i, 1
				%q0 = getelementptr i64, ptr %p, i64 %offset0
				%x0 = load i64, ptr %q0

				; Odd element of the pair: index 2*i + 1.
				%offset1 = add i64 %offset0, 1
				%q1 = getelementptr i64, ptr %p, i64 %offset1
				%x1 = load i64, ptr %q1

				%res = add i64 %x0, %x1

				; Only the even slot is written back.
				store i64 %res, ptr %q0

				; Fixed trip count of 1024 iterations.
				%nexti = add i64 %i, 1
				%done = icmp eq i64 %nexti, 1024
				br i1 %done, label %exit, label %loop
				exit:
				ret void
				}

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll

This file was added.

				; The cost output checked in this file is only printed through
				; -debug-only=loop-vectorize, which exists only in builds with assertions
				; enabled, so the test is restricted to those builds.
				; REQUIRES: asserts
				; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
				; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
				; Review note (luke): unfortunately there doesn't seem to be a way to print
				; out the result of `getInterleavedMemoryOpCost` other than via
				; -debug-only=loop-vectorize, i.e. it is not called in getInstructionCost.
				; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
				; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16

				; Factor-2 interleaved access over an array of {i8, i8} structs: both fields
				; are loaded, incremented by a constant, and stored back.
				; For each forced vector width the expected debug output reports a non-zero
				; cost on the first load (%l0) and on the last store (%a1), and a cost of 0
				; on the other member of each group.
				%i8.2 = type {i8, i8}
				define void @i8_factor_2(ptr %data, i64 %n) {
				entry:
				br label %for.body
				; VF_2-LABEL: Checking a loop in 'i8_factor_2'
				; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %l1 = load i8, ptr %p1, align 1
				; Review note (luke, presumably about the check line just above): @reames
				; Apologies for jumping the gun there, this is the line that fails without
				; that change to `getShuffleCost`. I've split it out into D146176, but the
				; gist is that this deinterleaved load ends up getting costed as an
				; interleave because it has a mask of `<0, 2>`. This test case covers it but
				; I wasn't able to find a way to test it in the aforementioned patch due to
				; how getInstructionCost doesn't cost shuffles that change length. (Happy to
				; take a look into that though if needed.)
				; VF_2: Found an estimated cost of 0 for VF 2 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_4-LABEL: Checking a loop in 'i8_factor_2'
				; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_4: Found an estimated cost of 0 for VF 4 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_8-LABEL: Checking a loop in 'i8_factor_2'
				; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_8: Found an estimated cost of 0 for VF 8 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_16-LABEL: Checking a loop in 'i8_factor_2'
				; VF_16: Found an estimated cost of 3 for VF 16 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_16: Found an estimated cost of 0 for VF 16 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_16-NEXT: Found an estimated cost of 5 for VF 16 For instruction: store i8 %a1, ptr %p1, align 1
				for.body:
				%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
				; Addresses of field 0 and field 1 of the i-th struct.
				%p0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
				%p1 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 1
				%l0 = load i8, ptr %p0, align 1
				%l1 = load i8, ptr %p1, align 1
				%a0 = add i8 %l0, 1
				%a1 = add i8 %l1, 2
				store i8 %a0, ptr %p0, align 1
				store i8 %a1, ptr %p1, align 1
				; Runtime trip count %n.
				%i.next = add nuw nsw i64 %i, 1
				%cond = icmp slt i64 %i.next, %n
				br i1 %cond, label %for.body, label %for.end

				for.end:
				ret void
				}

				; Factor-3 interleaved access over an array of {i8, i8, i8} structs: all
				; three fields are loaded, incremented by a constant, and stored back.
				; For each forced vector width the expected debug output reports a non-zero
				; cost on the first load (%l0) and on the last store (%a2), and a cost of 0
				; on the remaining members. The expected non-zero cost doubles with each
				; doubling of VF (6, 12, 24, 48).
				%i8.3 = type {i8, i8, i8}
				define void @i8_factor_3(ptr %data, i64 %n) {
				entry:
				br label %for.body
				; VF_2-LABEL: Checking a loop in 'i8_factor_3'
				; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %l2 = load i8, ptr %p2, align 1
				; VF_2: Found an estimated cost of 0 for VF 2 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_2: Found an estimated cost of 0 for VF 2 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store i8 %a2, ptr %p2, align 1
				; VF_4-LABEL: Checking a loop in 'i8_factor_3'
				; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %l2 = load i8, ptr %p2, align 1
				; VF_4: Found an estimated cost of 0 for VF 4 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_4: Found an estimated cost of 0 for VF 4 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store i8 %a2, ptr %p2, align 1
				; VF_8-LABEL: Checking a loop in 'i8_factor_3'
				; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %l2 = load i8, ptr %p2, align 1
				; VF_8: Found an estimated cost of 0 for VF 8 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_8: Found an estimated cost of 0 for VF 8 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store i8 %a2, ptr %p2, align 1
				; VF_16-LABEL: Checking a loop in 'i8_factor_3'
				; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %l0 = load i8, ptr %p0, align 1
				; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %l1 = load i8, ptr %p1, align 1
				; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %l2 = load i8, ptr %p2, align 1
				; VF_16: Found an estimated cost of 0 for VF 16 For instruction: store i8 %a0, ptr %p0, align 1
				; VF_16: Found an estimated cost of 0 for VF 16 For instruction: store i8 %a1, ptr %p1, align 1
				; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store i8 %a2, ptr %p2, align 1
				for.body:
				%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
				; Addresses of fields 0, 1 and 2 of the i-th struct.
				%p0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
				%p1 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 1
				%p2 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 2
				%l0 = load i8, ptr %p0, align 1
				%l1 = load i8, ptr %p1, align 1
				%l2 = load i8, ptr %p2, align 1
				%a0 = add i8 %l0, 1
				%a1 = add i8 %l1, 2
				%a2 = add i8 %l2, 3
				store i8 %a0, ptr %p0, align 1
				store i8 %a1, ptr %p1, align 1
				store i8 %a2, ptr %p2, align 1
				; Runtime trip count %n.
				%i.next = add nuw nsw i64 %i, 1
				%cond = icmp slt i64 %i.next, %n
				br i1 %cond, label %for.body, label %for.end

				for.end:
				ret void
				}

llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple riscv64-linux-gnu -mattr=+zve32f,+f -S 2>%t \| FileCheck %s			; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple riscv64-linux-gnu -mattr=+zve32f,+f -S 2>%t \| FileCheck %s

	target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"			target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
	target triple = "riscv64"			target triple = "riscv64"

	; We can't use scalable vectorization for Zvl32b due to RVVBitsPerBlock being			; We can't use scalable vectorization for Zvl32b due to RVVBitsPerBlock being
	; 64. Since our vscale value is vlen/RVVBitsPerBlock this makes vscale 0.			; 64. Since our vscale value is vlen/RVVBitsPerBlock this makes vscale 0.
	; Make sure we fall back to fixed vectorization instead.			; Make sure we fall back to fixed vectorization instead.
	define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) {			define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) {
	; CHECK-LABEL: @vector_add_i16(			; CHECK-LABEL: @vector_add_i16(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]			; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
	; CHECK: vector.ph:			; CHECK: vector.ph:
	; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.]] = insertelement <2 x i16> poison, i16 [[V:%.]], i64 0			; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.]] = insertelement <2 x i16> poison, i16 [[V:%.]], i64 0
	; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer			; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
	; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i16> poison, i16 [[V]], i64 0			; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i16> poison, i16 [[V]], i64 0
	; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT3]], <2 x i16> poison, <2 x i32> zeroinitializer			; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT4]], <2 x i16> poison, <2 x i32> zeroinitializer
	; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]			; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
	; CHECK: vector.body:			; CHECK: vector.body:
	; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]			; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
	; CHECK-NEXT: [[VEC_IND:%.]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.]], [[VECTOR_BODY]] ]			; CHECK-NEXT: [[VEC_IND:%.]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.]], [[VECTOR_BODY]] ]
	; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>			; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
	; CHECK-NEXT: [[TMP0:%.]] = getelementptr inbounds i32, ptr [[A:%.]], <2 x i64> [[VEC_IND]]			; CHECK-NEXT: [[TMP0:%.]] = getelementptr inbounds i32, ptr [[A:%.]], <2 x i64> [[VEC_IND]]
	; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], <2 x i64> [[STEP_ADD]]			; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], <2 x i64> [[STEP_ADD]]
	; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP0]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)			; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
	; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP1]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)			; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0
	; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i16> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]			; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
	; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[WIDE_MASKED_GATHER2]], [[BROADCAST_SPLAT4]]			; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
	; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP2]], <2 x ptr> [[TMP0]], i32 2, <2 x i1> <i1 true, i1 true>)			; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
	; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP3]], <2 x ptr> [[TMP1]], i32 2, <2 x i1> <i1 true, i1 true>)			; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i16> [[WIDE_VEC2]], <4 x i16> poison, <2 x i32> <i32 0, i32 2>
				; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i16> [[STRIDED_VEC3]], [[BROADCAST_SPLAT5]]
				; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP6]], <2 x ptr> [[TMP0]], i32 2, <2 x i1> <i1 true, i1 true>)
				; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP7]], <2 x ptr> [[TMP1]], i32 2, <2 x i1> <i1 true, i1 true>)
	; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4			; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
	; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>			; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
	; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024			; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
	; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]			; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; CHECK: middle.block:			; CHECK: middle.block:
	; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024			; CHECK-NEXT: br label [[SCALAR_PH]]
	; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; CHECK: scalar.ph:			; CHECK: scalar.ph:
	; CHECK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]			; CHECK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ 1020, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ]
	; CHECK-NEXT: br label [[FOR_BODY:%.*]]			; CHECK-NEXT: br label [[FOR_BODY:%.*]]
	; CHECK: for.body:			; CHECK: for.body:
	; CHECK-NEXT: [[IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.]], [[FOR_BODY]] ]			; CHECK-NEXT: [[IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.]], [[FOR_BODY]] ]
	; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]			; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
	; CHECK-NEXT: [[ELEM:%.*]] = load i16, ptr [[ARRAYIDX]], align 2			; CHECK-NEXT: [[ELEM:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
	; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ELEM]], [[V]]			; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ELEM]], [[V]]
	; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX]], align 2			; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX]], align 2
	; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1			; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
	; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024			; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
	; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]			; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
	; CHECK: for.end:			; CHECK: for.end:
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	br label %for.body			br label %for.body

	for.body:			for.body:
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]			%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
	Show All 11 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[RISCV] Enable interleaved access vectorization
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 503775

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll

llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll

This is an archive of the discontinued LLVM Phabricator instance.

[RISCV] Enable interleaved access vectorization
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 503775

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll

llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll

[RISCV] Enable interleaved access vectorization
ClosedPublic