This is an archive of the discontinued LLVM Phabricator instance.

[LV][RISCV] Don't interleave scalable vector loops
ClosedPublic

Authored by luke on Feb 21 2023, 7:06 AM.

Download Raw Diff

Details

Reviewers

fhahn
loralb
craig.topper
reames

Commits

rG15f9cf164c00: [LV][RISCV] Don't interleave scalable vector loops

Summary

It's less clear with scalable vectors than fixed length vectors that
interleaving exposes more ILP, as scalable vectors can be thought of as a
sort of hardware form of interleaving, especially with larger LMULs.
This also addresses the unexpected additional unrolling that occurs when
using larger LMULs in the loop vectorizer.

Diff Detail

Repository: rG LLVM Github Monorepo

Unit Tests: Failed

	Time	Test
	60,060 ms	x64 debian > libFuzzer.libFuzzer::fuzzer-leak.test
	60,040 ms	x64 debian > libFuzzer.libFuzzer::value-profile-load.test

Event Timeline

luke created this revision.Feb 21 2023, 7:06 AM

Herald added a project: Restricted Project. · View Herald TranscriptFeb 21 2023, 7:06 AM

Herald added subscribers: asb, pmatos, VincentWu and 30 others. · View Herald Transcript

luke requested review of this revision.Feb 21 2023, 7:06 AM

Herald added a project: Restricted Project. · View Herald TranscriptFeb 21 2023, 7:06 AM

Herald added subscribers: llvm-commits, • pcwang-thead, alextsao1999 and 2 others. · View Herald Transcript

luke added a parent revision: D144474: [LV][NFC] Use ElementCount for getMaxInterleaveFactor.Feb 21 2023, 7:06 AM

Harbormaster completed remote builds in B215013: Diff 499160.Feb 21 2023, 8:00 AM

LGTM

I'm not sure we should leave this on for fixed vectors either once we go to LMUL=2.

This revision is now accepted and ready to land.Feb 21 2023, 10:46 AM

Fix scalable-reductions.ll test

Harbormaster completed remote builds in B215121: Diff 499313.Feb 21 2023, 4:59 PM

This revision was landed with ongoing or failed builds.Feb 22 2023, 2:15 AM

Closed by commit rG15f9cf164c00: [LV][RISCV] Don't interleave scalable vector loops (authored by luke). · Explain Why

This revision was automatically updated to reflect the committed changes.

luke added a commit: rG15f9cf164c00: [LV][RISCV] Don't interleave scalable vector loops.

In D144485#4142300, @craig.topper wrote:

I'm not sure we should leave this on for fixed vectors either once we go to LMUL=2.

I think I agree with this point. In a follow-on patch, we should probably revisit this part of the heuristic as well.

A more general framing here is that interleave and LMUL both increase register pressure and tail effects, and that as we increase one we probably need to decrease the other by a roughly equal ratio.

Revision Contents

Path

Size

llvm/

lib/

Target/

RISCV/

RISCVTargetTransformInfo.h

5 lines

test/

Transforms/

LoopVectorize/

RISCV/

51 lines

101 lines

117 lines

58 lines

riscv-vector-reverse.ll

4 lines

scalable-basics.ll

82 lines

scalable-reductions.ll

3 lines

Diff 499313

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Show First 20 Lines • Show All 265 Lines • ▼ Show 20 Lines	bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
case RecurKind::FMulAdd:		case RecurKind::FMulAdd:
return true;		return true;
default:		default:
return false;		return false;
}		}
}		}

unsigned getMaxInterleaveFactor(ElementCount VF) {		unsigned getMaxInterleaveFactor(ElementCount VF) {
		// Don't interleave if the loop has been vectorized with scalable vectors.
		if (VF.isScalable())
		return 1;
// If the loop will not be vectorized, don't interleave the loop.		// If the loop will not be vectorized, don't interleave the loop.
// Let regular unroll to unroll the loop.		// Let regular unroll to unroll the loop.
return VF.getKnownMinValue() == 1 ? 1 : ST->getMaxInterleaveFactor();		return VF.isScalar() ? 1 : ST->getMaxInterleaveFactor();
}		}

enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };		enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
unsigned getNumberOfRegisters(unsigned ClassID) const {		unsigned getNumberOfRegisters(unsigned ClassID) const {
switch (ClassID) {		switch (ClassID) {
case RISCVRegisterClass::GPRRC:		case RISCVRegisterClass::GPRRC:
// 31 = 32 GPR - x0 (zero register)		// 31 = 32 GPR - x0 (zero register)
// FIXME: Should we exclude fixed registers like SP, TP or GP?		// FIXME: Should we exclude fixed registers like SP, TP or GP?
▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll

	Show First 20 Lines • Show All 939 Lines • ▼ Show 20 Lines
	for.end:			for.end:
	ret void			ret void
	}			}

	define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {			define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
	; CHECK-LABEL: @predicated_sdiv_by_minus_one(			; CHECK-LABEL: @predicated_sdiv_by_minus_one(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16			; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
	; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]			; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
	; CHECK: vector.ph:			; CHECK: vector.ph:
	; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16			; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
	; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]			; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
	; CHECK: vector.body:			; CHECK: vector.body:
	; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0			; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
	; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()			; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]]
	; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8			; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
	; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0			; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
	; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1			; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 8 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -128, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]			; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]]			; CHECK-NEXT: [[TMP9:%.*]] = sdiv <vscale x 8 x i8> [[WIDE_LOAD]], [[TMP8]]
	; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]			; CHECK-NEXT: [[TMP10:%.*]] = xor <vscale x 8 x i1> [[TMP7]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0			; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> [[TMP9]], <vscale x 8 x i8> [[WIDE_LOAD]]
	; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1			; CHECK-NEXT: store <vscale x 8 x i8> [[PREDPHI]], ptr [[TMP6]], align 1
	; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
	; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8			; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
	; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP14]]			; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
	; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1			; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <vscale x 8 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -128, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)			; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
	; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <vscale x 8 x i8> [[WIDE_LOAD1]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -128, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP18:%.*]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP19:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 -1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP20:%.*]] = sdiv <vscale x 8 x i8> [[WIDE_LOAD]], [[TMP18]]
	; CHECK-NEXT: [[TMP21:%.*]] = sdiv <vscale x 8 x i8> [[WIDE_LOAD1]], [[TMP19]]
	; CHECK-NEXT: [[TMP22:%.*]] = xor <vscale x 8 x i1> [[TMP16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 8 x i1> [[TMP17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
	; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i8> [[TMP20]], <vscale x 8 x i8> [[WIDE_LOAD]]
	; CHECK-NEXT: [[PREDPHI2:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i8> [[TMP21]], <vscale x 8 x i8> [[WIDE_LOAD1]]
	; CHECK-NEXT: store <vscale x 8 x i8> [[PREDPHI]], ptr [[TMP12]], align 1
	; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
	; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 8
	; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP25]]
	; CHECK-NEXT: store <vscale x 8 x i8> [[PREDPHI2]], ptr [[TMP26]], align 1
	; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
	; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16
	; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]]
	; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
	; CHECK: middle.block:			; CHECK: middle.block:
	; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; CHECK: scalar.ph:			; CHECK: scalar.ph:
	; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; CHECK-NEXT: br label [[FOR_BODY:%.*]]			; CHECK-NEXT: br label [[FOR_BODY:%.*]]
	; CHECK: for.body:			; CHECK: for.body:
	; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]			; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
	▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - \| FileCheck %s -check-prefix=OUTLOOP			; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - \| FileCheck %s -check-prefix=OUTLOOP
	; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - \| FileCheck %s -check-prefix=INLOOP			; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - \| FileCheck %s -check-prefix=INLOOP


	target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"			target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
	target triple = "riscv64"			target triple = "riscv64"

	define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {			define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
	; OUTLOOP-LABEL: @add_i16_i32(			; OUTLOOP-LABEL: @add_i16_i32(
	; OUTLOOP-NEXT: entry:			; OUTLOOP-NEXT: entry:
	; OUTLOOP-NEXT: [[CMP6:%.]] = icmp sgt i32 [[N:%.]], 0			; OUTLOOP-NEXT: [[CMP6:%.]] = icmp sgt i32 [[N:%.]], 0
	; OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.]], label [[FOR_COND_CLEANUP:%.]]			; OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.]], label [[FOR_COND_CLEANUP:%.]]
	; OUTLOOP: for.body.preheader:			; OUTLOOP: for.body.preheader:
	; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()			; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
	; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4			; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
	; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]			; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
	; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]			; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
	; OUTLOOP: vector.ph:			; OUTLOOP: vector.ph:
	; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()			; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
	; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4			; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
	; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]			; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
	; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]			; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
	; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]			; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
	; OUTLOOP: vector.body:			; OUTLOOP: vector.body:
	; OUTLOOP-NEXT: [[INDEX:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]			; OUTLOOP-NEXT: [[INDEX:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
	; OUTLOOP-NEXT: [[VEC_PHI:%.]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.]], [[VECTOR_BODY]] ]			; OUTLOOP-NEXT: [[VEC_PHI:%.]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.]], [[VECTOR_BODY]] ]
	; OUTLOOP-NEXT: [[VEC_PHI1:%.]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.]], [[VECTOR_BODY]] ]
	; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0			; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
	; OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()			; OUTLOOP-NEXT: [[TMP5:%.]] = getelementptr inbounds i16, ptr [[X:%.]], i32 [[TMP4]]
	; OUTLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2			; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
	; OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0			; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP6]], align 2
	; OUTLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1			; OUTLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
	; OUTLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]			; OUTLOOP-NEXT: [[TMP8]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP7]]
	; OUTLOOP-NEXT: [[TMP10:%.]] = getelementptr inbounds i16, ptr [[X:%.]], i32 [[TMP4]]			; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
	; OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]			; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2
	; OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0			; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]]
	; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP12]], align 2			; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
	; OUTLOOP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; OUTLOOP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
	; OUTLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP14]]
	; OUTLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i16>, ptr [[TMP15]], align 2
	; OUTLOOP-NEXT: [[TMP16:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i32>
	; OUTLOOP-NEXT: [[TMP17:%.*]] = sext <vscale x 2 x i16> [[WIDE_LOAD2]] to <vscale x 2 x i32>
	; OUTLOOP-NEXT: [[TMP18]] = add <vscale x 2 x i32> [[VEC_PHI]], [[TMP16]]
	; OUTLOOP-NEXT: [[TMP19]] = add <vscale x 2 x i32> [[VEC_PHI1]], [[TMP17]]
	; OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
	; OUTLOOP-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 4
	; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP21]]
	; OUTLOOP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
	; OUTLOOP-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; OUTLOOP: middle.block:			; OUTLOOP: middle.block:
	; OUTLOOP-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[TMP19]], [[TMP18]]			; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[TMP8]])
	; OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
	; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]			; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
	; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]			; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
	; OUTLOOP: scalar.ph:			; OUTLOOP: scalar.ph:
	; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]			; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
	; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]			; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
	; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]			; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
	; OUTLOOP: for.body:			; OUTLOOP: for.body:
	; OUTLOOP-NEXT: [[I_08:%.]] = phi i32 [ [[INC:%.]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]			; OUTLOOP-NEXT: [[I_08:%.]] = phi i32 [ [[INC:%.]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
	; OUTLOOP-NEXT: [[R_07:%.]] = phi i32 [ [[ADD:%.]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]			; OUTLOOP-NEXT: [[R_07:%.]] = phi i32 [ [[ADD:%.]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
	; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]			; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
	; OUTLOOP-NEXT: [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX]], align 2			; OUTLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
	; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP24]] to i32			; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
	; OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]			; OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
	; OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1			; OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
	; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]			; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
	; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]			; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
	; OUTLOOP: for.cond.cleanup.loopexit:			; OUTLOOP: for.cond.cleanup.loopexit:
	; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]			; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
	; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]			; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
	; OUTLOOP: for.cond.cleanup:			; OUTLOOP: for.cond.cleanup:
	; OUTLOOP-NEXT: [[R_0_LCSSA:%.]] = phi i32 [ 0, [[ENTRY:%.]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]			; OUTLOOP-NEXT: [[R_0_LCSSA:%.]] = phi i32 [ 0, [[ENTRY:%.]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
	; OUTLOOP-NEXT: ret i32 [[R_0_LCSSA]]			; OUTLOOP-NEXT: ret i32 [[R_0_LCSSA]]
	;			;
	; INLOOP-LABEL: @add_i16_i32(			; INLOOP-LABEL: @add_i16_i32(
	; INLOOP-NEXT: entry:			; INLOOP-NEXT: entry:
	; INLOOP-NEXT: [[CMP6:%.]] = icmp sgt i32 [[N:%.]], 0			; INLOOP-NEXT: [[CMP6:%.]] = icmp sgt i32 [[N:%.]], 0
	; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.]], label [[FOR_COND_CLEANUP:%.]]			; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.]], label [[FOR_COND_CLEANUP:%.]]
	; INLOOP: for.body.preheader:			; INLOOP: for.body.preheader:
	; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()			; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
	; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8			; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
	; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]			; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
	; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]			; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
	; INLOOP: vector.ph:			; INLOOP: vector.ph:
	; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()			; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
	; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8			; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
	; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]			; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
	; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]			; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
	; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]			; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
	; INLOOP: vector.body:			; INLOOP: vector.body:
	; INLOOP-NEXT: [[INDEX:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]			; INLOOP-NEXT: [[INDEX:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
	; INLOOP-NEXT: [[VEC_PHI:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP19:%.]], [[VECTOR_BODY]] ]			; INLOOP-NEXT: [[VEC_PHI:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.]], [[VECTOR_BODY]] ]
	; INLOOP-NEXT: [[VEC_PHI1:%.]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.]], [[VECTOR_BODY]] ]
	; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0			; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
	; INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()			; INLOOP-NEXT: [[TMP5:%.]] = getelementptr inbounds i16, ptr [[X:%.]], i32 [[TMP4]]
	; INLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4			; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
	; INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0			; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
	; INLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1			; INLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
	; INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]			; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
	; INLOOP-NEXT: [[TMP10:%.]] = getelementptr inbounds i16, ptr [[X:%.]], i32 [[TMP4]]			; INLOOP-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
	; INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]			; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
	; INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0			; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
	; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2			; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]]
	; INLOOP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
	; INLOOP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4			; INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; INLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP14]]
	; INLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
	; INLOOP-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
	; INLOOP-NEXT: [[TMP17:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD2]] to <vscale x 4 x i32>
	; INLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
	; INLOOP-NEXT: [[TMP19]] = add i32 [[TMP18]], [[VEC_PHI]]
	; INLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP17]])
	; INLOOP-NEXT: [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI1]]
	; INLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
	; INLOOP-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 8
	; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP23]]
	; INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
	; INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; INLOOP: middle.block:			; INLOOP: middle.block:
	; INLOOP-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP19]]
	; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]			; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
	; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]			; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
	; INLOOP: scalar.ph:			; INLOOP: scalar.ph:
	; INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]			; INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
	; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]			; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
	; INLOOP-NEXT: br label [[FOR_BODY:%.*]]			; INLOOP-NEXT: br label [[FOR_BODY:%.*]]
	; INLOOP: for.body:			; INLOOP: for.body:
	; INLOOP-NEXT: [[I_08:%.]] = phi i32 [ [[INC:%.]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]			; INLOOP-NEXT: [[I_08:%.]] = phi i32 [ [[INC:%.]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
	; INLOOP-NEXT: [[R_07:%.]] = phi i32 [ [[ADD:%.]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]			; INLOOP-NEXT: [[R_07:%.]] = phi i32 [ [[ADD:%.]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
	; INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]			; INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
	; INLOOP-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX]], align 2			; INLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
	; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP25]] to i32			; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
	; INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]			; INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
	; INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1			; INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
	; INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]			; INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
	; INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]			; INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
	; INLOOP: for.cond.cleanup.loopexit:			; INLOOP: for.cond.cleanup.loopexit:
	; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]			; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
	; INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]			; INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
	; INLOOP: for.cond.cleanup:			; INLOOP: for.cond.cleanup:
	; INLOOP-NEXT: [[R_0_LCSSA:%.]] = phi i32 [ 0, [[ENTRY:%.]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]			; INLOOP-NEXT: [[R_0_LCSSA:%.]] = phi i32 [ 0, [[ENTRY:%.]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
	; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]			; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
	;			;
	entry:			entry:
	%cmp6 = icmp sgt i32 %n, 0			%cmp6 = icmp sgt i32 %n, 0
	br i1 %cmp6, label %for.body, label %for.cond.cleanup			br i1 %cmp6, label %for.body, label %for.cond.cleanup
	Show All 16 Lines

llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll

	Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; LMUL1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024			; LMUL1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
	; LMUL1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]			; LMUL1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
	; LMUL1: for.end:			; LMUL1: for.end:
	; LMUL1-NEXT: ret void			; LMUL1-NEXT: ret void
	;			;
	; LMUL2-LABEL: @load_store(			; LMUL2-LABEL: @load_store(
	; LMUL2-NEXT: entry:			; LMUL2-NEXT: entry:
	; LMUL2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; LMUL2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4			; LMUL2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
	; LMUL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; LMUL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; LMUL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]			; LMUL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_PH:%.]]
	; LMUL2: vector.ph:			; LMUL2: vector.ph:
	; LMUL2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; LMUL2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4			; LMUL2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
	; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; LMUL2-NEXT: br label [[VECTOR_BODY:%.*]]			; LMUL2-NEXT: br label [[VECTOR_BODY:%.*]]
	; LMUL2: vector.body:			; LMUL2: vector.body:
	; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; LMUL2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0			; LMUL2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
	; LMUL2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()			; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
	; LMUL2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2			; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
	; LMUL2-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0			; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 4
	; LMUL2-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1			; LMUL2-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
	; LMUL2-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]			; LMUL2-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 4
	; LMUL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]			; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]			; LMUL2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
	; LMUL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0			; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
	; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4			; LMUL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; LMUL2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; LMUL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; LMUL2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
	; LMUL2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
	; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
	; LMUL2-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
	; LMUL2-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
	; LMUL2-NEXT: store <vscale x 2 x i64> [[TMP16]], ptr [[TMP12]], align 4
	; LMUL2-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
	; LMUL2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP19]]
	; LMUL2-NEXT: store <vscale x 2 x i64> [[TMP17]], ptr [[TMP20]], align 4
	; LMUL2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
	; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
	; LMUL2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; LMUL2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; LMUL2: middle.block:			; LMUL2: middle.block:
	; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; LMUL2: scalar.ph:			; LMUL2: scalar.ph:
	; LMUL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; LMUL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; LMUL2-NEXT: br label [[FOR_BODY:%.*]]			; LMUL2-NEXT: br label [[FOR_BODY:%.*]]
	; LMUL2: for.body:			; LMUL2: for.body:
	; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]			; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
	; LMUL2-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]			; LMUL2-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
	; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4			; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4
	; LMUL2-NEXT: [[W:%.*]] = add i64 [[V]], 1			; LMUL2-NEXT: [[W:%.*]] = add i64 [[V]], 1
	; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 4			; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 4
	; LMUL2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1			; LMUL2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
	; LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024			; LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
	; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]			; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
	; LMUL2: for.end:			; LMUL2: for.end:
	; LMUL2-NEXT: ret void			; LMUL2-NEXT: ret void
	;			;
	; LMUL4-LABEL: @load_store(			; LMUL4-LABEL: @load_store(
	; LMUL4-NEXT: entry:			; LMUL4-NEXT: entry:
	; LMUL4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; LMUL4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8			; LMUL4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
	; LMUL4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; LMUL4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; LMUL4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]			; LMUL4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
	; LMUL4: vector.ph:			; LMUL4: vector.ph:
	; LMUL4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; LMUL4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8			; LMUL4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
	; LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; LMUL4-NEXT: br label [[VECTOR_BODY:%.*]]			; LMUL4-NEXT: br label [[VECTOR_BODY:%.*]]
	; LMUL4: vector.body:			; LMUL4: vector.body:
	; LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; LMUL4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0			; LMUL4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
	; LMUL4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()			; LMUL4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
	; LMUL4-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4			; LMUL4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
	; LMUL4-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0			; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP6]], align 4
	; LMUL4-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1			; LMUL4-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
	; LMUL4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]			; LMUL4-NEXT: store <vscale x 4 x i64> [[TMP7]], ptr [[TMP6]], align 4
	; LMUL4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]			; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]			; LMUL4-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
	; LMUL4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0			; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
	; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP12]], align 4			; LMUL4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; LMUL4-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; LMUL4-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; LMUL4-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
	; LMUL4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
	; LMUL4-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i64>, ptr [[TMP15]], align 4
	; LMUL4-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
	; LMUL4-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i64> [[WIDE_LOAD1]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
	; LMUL4-NEXT: store <vscale x 4 x i64> [[TMP16]], ptr [[TMP12]], align 4
	; LMUL4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL4-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
	; LMUL4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP19]]
	; LMUL4-NEXT: store <vscale x 4 x i64> [[TMP17]], ptr [[TMP20]], align 4
	; LMUL4-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL4-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8
	; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
	; LMUL4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; LMUL4-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; LMUL4: middle.block:			; LMUL4: middle.block:
	; LMUL4-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; LMUL4-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; LMUL4-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; LMUL4-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; LMUL4: scalar.ph:			; LMUL4: scalar.ph:
	; LMUL4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; LMUL4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; LMUL4-NEXT: br label [[FOR_BODY:%.*]]			; LMUL4-NEXT: br label [[FOR_BODY:%.*]]
	; LMUL4: for.body:			; LMUL4: for.body:
	; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]			; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
	; LMUL4-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]			; LMUL4-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
	; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4			; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 4
	; LMUL4-NEXT: [[W:%.*]] = add i64 [[V]], 1			; LMUL4-NEXT: [[W:%.*]] = add i64 [[V]], 1
	; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 4			; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 4
	; LMUL4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1			; LMUL4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
	; LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024			; LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
	; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]			; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
	; LMUL4: for.end:			; LMUL4: for.end:
	; LMUL4-NEXT: ret void			; LMUL4-NEXT: ret void
	;			;
	; LMUL8-LABEL: @load_store(			; LMUL8-LABEL: @load_store(
	; LMUL8-NEXT: entry:			; LMUL8-NEXT: entry:
	; LMUL8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; LMUL8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL8-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16			; LMUL8-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
	; LMUL8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; LMUL8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; LMUL8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]			; LMUL8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
	; LMUL8: vector.ph:			; LMUL8: vector.ph:
	; LMUL8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; LMUL8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16			; LMUL8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
	; LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; LMUL8-NEXT: br label [[VECTOR_BODY:%.*]]			; LMUL8-NEXT: br label [[VECTOR_BODY:%.*]]
	; LMUL8: vector.body:			; LMUL8: vector.body:
	; LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; LMUL8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0			; LMUL8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
	; LMUL8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()			; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
	; LMUL8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8			; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
	; LMUL8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0			; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP6]], align 4
	; LMUL8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1			; LMUL8-NEXT: [[TMP7:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
	; LMUL8-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]			; LMUL8-NEXT: store <vscale x 8 x i64> [[TMP7]], ptr [[TMP6]], align 4
	; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]			; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]			; LMUL8-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
	; LMUL8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0			; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
	; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP12]], align 4			; LMUL8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; LMUL8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; LMUL8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; LMUL8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8
	; LMUL8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
	; LMUL8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i64>, ptr [[TMP15]], align 4
	; LMUL8-NEXT: [[TMP16:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
	; LMUL8-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD1]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
	; LMUL8-NEXT: store <vscale x 8 x i64> [[TMP16]], ptr [[TMP12]], align 4
	; LMUL8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL8-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
	; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP19]]
	; LMUL8-NEXT: store <vscale x 8 x i64> [[TMP17]], ptr [[TMP20]], align 4
	; LMUL8-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
	; LMUL8-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
	; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
	; LMUL8-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; LMUL8-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; LMUL8: middle.block:			; LMUL8: middle.block:
	; LMUL8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; LMUL8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; LMUL8-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; LMUL8-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; LMUL8: scalar.ph:			; LMUL8: scalar.ph:
	; LMUL8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; LMUL8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; LMUL8-NEXT: br label [[FOR_BODY:%.*]]			; LMUL8-NEXT: br label [[FOR_BODY:%.*]]
	; LMUL8: for.body:			; LMUL8: for.body:
	; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]			; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
	Show All 26 Lines

llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=VLENUNK			; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=VLENUNK

	target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"			target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
	target triple = "riscv64"			target triple = "riscv64"

	; FIXME: In this example, we pick a vector index width which is wider than			; FIXME: In this example, we pick a vector index width which is wider than
	; the data width. This is correct, but sub-optimal as it causes a vsetvli			; the data width. This is correct, but sub-optimal as it causes a vsetvli
	; toggle in the generated code for no reason. We could have used a i32			; toggle in the generated code for no reason. We could have used a i32
	; element type for the index here and matched the data width.			; element type for the index here and matched the data width.
	define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {			define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
	; VLENUNK-LABEL: @test(			; VLENUNK-LABEL: @test(
	; VLENUNK-NEXT: entry:			; VLENUNK-NEXT: entry:
	; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4			; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
	; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]			; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
	; VLENUNK: vector.ph:			; VLENUNK: vector.ph:
	; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4			; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
	; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; VLENUNK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()			; VLENUNK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
	; VLENUNK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer			; VLENUNK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
	; VLENUNK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)			; VLENUNK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
	; VLENUNK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]			; VLENUNK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
	; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2			; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
	; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]			; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
	; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0			; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
	; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer			; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
	; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0			; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0
	; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer			; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
	; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i64 0
	; VLENUNK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
	; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]			; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
	; VLENUNK: vector.body:			; VLENUNK: vector.body:
	; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]			; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
	; VLENUNK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
	; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0			; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
	; VLENUNK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 512, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
	; VLENUNK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2			; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]]
	; VLENUNK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0			; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
	; VLENUNK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1			; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP13]], i32 4, <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i32> poison)
	; VLENUNK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]			; VLENUNK-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[TMP11]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
	; VLENUNK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 512, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)			; VLENUNK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[WIDE_MASKED_LOAD]]
	; VLENUNK-NEXT: [[TMP17:%.*]] = icmp ult <vscale x 2 x i64> [[STEP_ADD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 512, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)			; VLENUNK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i32> [[PREDPHI]], [[BROADCAST_SPLAT]]
	; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]]			; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]]
	; VLENUNK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP15]]			; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
	; VLENUNK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0			; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP15]], ptr [[TMP17]], align 4
	; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP20]], i32 4, <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i32> poison)			; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
	; VLENUNK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2			; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
	; VLENUNK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP22]]			; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
	; VLENUNK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP23]], i32 4, <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i32> poison)			; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; VLENUNK-NEXT: [[TMP24:%.*]] = xor <vscale x 2 x i1> [[TMP16]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)			; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; VLENUNK-NEXT: [[TMP25:%.*]] = xor <vscale x 2 x i1> [[TMP17]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
	; VLENUNK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP24]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[WIDE_MASKED_LOAD]]
	; VLENUNK-NEXT: [[PREDPHI3:%.*]] = select <vscale x 2 x i1> [[TMP25]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[WIDE_MASKED_LOAD2]]
	; VLENUNK-NEXT: [[TMP26:%.*]] = add <vscale x 2 x i32> [[PREDPHI]], [[BROADCAST_SPLAT]]
	; VLENUNK-NEXT: [[TMP27:%.*]] = add <vscale x 2 x i32> [[PREDPHI3]], [[BROADCAST_SPLAT5]]
	; VLENUNK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]]
	; VLENUNK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP15]]
	; VLENUNK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 0
	; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP26]], ptr [[TMP30]], align 4
	; VLENUNK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 2
	; VLENUNK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP32]]
	; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP27]], ptr [[TMP33]], align 4
	; VLENUNK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
	; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]]
	; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[STEP_ADD]], [[DOTSPLAT]]
	; VLENUNK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; VLENUNK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
	; VLENUNK: middle.block:			; VLENUNK: middle.block:
	; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; VLENUNK: scalar.ph:			; VLENUNK: scalar.ph:
	; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]			; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]
	; VLENUNK: for.body:			; VLENUNK: for.body:
	; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]			; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
	▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

	Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: LV(REG): Found max usage: 2 item			; CHECK-NEXT: LV(REG): Found max usage: 2 item
	; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers			; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
	; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers			; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
	; CHECK-NEXT: LV(REG): Found invariant usage: 1 item			; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
	; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers			; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
	; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class			; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
	; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class			; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
	; CHECK-NEXT: LV: Loop cost is 23			; CHECK-NEXT: LV: Loop cost is 23
	; CHECK-NEXT: LV: IC is 2			; CHECK-NEXT: LV: IC is 1
	; CHECK-NEXT: LV: VF is vscale x 4			; CHECK-NEXT: LV: VF is vscale x 4
	; CHECK-NEXT: LV: Not Interleaving.			; CHECK-NEXT: LV: Not Interleaving.
	; CHECK-NEXT: LV: Interleaving is not beneficial.			; CHECK-NEXT: LV: Interleaving is not beneficial.
	; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>			; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
	; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop			; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
	; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1			; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
	; CHECK-NEXT: LV: Interleaving disabled by the pass manager			; CHECK-NEXT: LV: Interleaving disabled by the pass manager
	;			;
	▲ Show 20 Lines • Show All 115 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: LV(REG): Found max usage: 2 item			; CHECK-NEXT: LV(REG): Found max usage: 2 item
	; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers			; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
	; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers			; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
	; CHECK-NEXT: LV(REG): Found invariant usage: 1 item			; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
	; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers			; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
	; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class			; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
	; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class			; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
	; CHECK-NEXT: LV: Loop cost is 23			; CHECK-NEXT: LV: Loop cost is 23
	; CHECK-NEXT: LV: IC is 2			; CHECK-NEXT: LV: IC is 1
	; CHECK-NEXT: LV: VF is vscale x 4			; CHECK-NEXT: LV: VF is vscale x 4
	; CHECK-NEXT: LV: Not Interleaving.			; CHECK-NEXT: LV: Not Interleaving.
	; CHECK-NEXT: LV: Interleaving is not beneficial.			; CHECK-NEXT: LV: Interleaving is not beneficial.
	; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>			; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
	; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop			; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
	; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1			; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
	; CHECK-NEXT: LV: Interleaving disabled by the pass manager			; CHECK-NEXT: LV: Interleaving disabled by the pass manager
	;			;
	Show All 31 Lines

llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll

	Show First 20 Lines • Show All 115 Lines • ▼ Show 20 Lines

	; Same as above, but with op type of i32. We currently have a bug around			; Same as above, but with op type of i32. We currently have a bug around
	; etype=ELEN profitability in the vectorizer, and having a smaller element			; etype=ELEN profitability in the vectorizer, and having a smaller element
	; width test allows us to highlight different aspects of codegen.			; width test allows us to highlight different aspects of codegen.
	define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {			define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
	; VLENUNK-LABEL: @vector_add_i32(			; VLENUNK-LABEL: @vector_add_i32(
	; VLENUNK-NEXT: entry:			; VLENUNK-NEXT: entry:
	; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4			; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
	; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]			; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
	; VLENUNK: vector.ph:			; VLENUNK: vector.ph:
	; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4			; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
	; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0			; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0
	; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer			; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
	; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i64 0
	; VLENUNK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
	; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]			; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
	; VLENUNK: vector.body:			; VLENUNK: vector.body:
	; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0			; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
	; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
	; VLENUNK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2			; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
	; VLENUNK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0			; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 4
	; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1			; VLENUNK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
	; VLENUNK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]			; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP7]], ptr [[TMP6]], align 4
	; VLENUNK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]			; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]			; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
	; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0			; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
	; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 4			; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; VLENUNK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
	; VLENUNK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
	; VLENUNK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]]
	; VLENUNK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP15]], align 4
	; VLENUNK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
	; VLENUNK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
	; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP16]], ptr [[TMP12]], align 4
	; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
	; VLENUNK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP19]]
	; VLENUNK-NEXT: store <vscale x 2 x i32> [[TMP17]], ptr [[TMP20]], align 4
	; VLENUNK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
	; VLENUNK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
	; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
	; VLENUNK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; VLENUNK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
	; VLENUNK: middle.block:			; VLENUNK: middle.block:
	; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; VLENUNK: scalar.ph:			; VLENUNK: scalar.ph:
	; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]			; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]
	; VLENUNK: for.body:			; VLENUNK: for.body:
	; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]			; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
	; VLENUNK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]			; VLENUNK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
	; VLENUNK-NEXT: [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4			; VLENUNK-NEXT: [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
	; VLENUNK-NEXT: [[ADD:%.*]] = add i32 [[ELEM]], [[V]]			; VLENUNK-NEXT: [[ADD:%.*]] = add i32 [[ELEM]], [[V]]
	; VLENUNK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4			; VLENUNK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
	; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1			; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
	; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024			; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
	; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]			; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
	; VLENUNK: for.end:			; VLENUNK: for.end:
	; VLENUNK-NEXT: ret void			; VLENUNK-NEXT: ret void
	;			;
	; VLEN128-LABEL: @vector_add_i32(			; VLEN128-LABEL: @vector_add_i32(
	; VLEN128-NEXT: entry:			; VLEN128-NEXT: entry:
	; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()			; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
	; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4			; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
	; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]			; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
	; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]			; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
	; VLEN128: vector.ph:			; VLEN128: vector.ph:
	; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()			; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
	; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4			; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
	; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]			; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
	; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]			; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
	; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0			; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i64 0
	; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer			; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
	; VLEN128-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i64 0
	; VLEN128-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
	; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]			; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
	; VLEN128: vector.body:			; VLEN128: vector.body:
	; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]			; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
	; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0			; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
	; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()			; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
	; VLEN128-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2			; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
	; VLEN128-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0			; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 4
	; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1			; VLEN128-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
	; VLEN128-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]			; VLEN128-NEXT: store <vscale x 2 x i32> [[TMP7]], ptr [[TMP6]], align 4
	; VLEN128-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]			; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
	; VLEN128-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]			; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
	; VLEN128-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0			; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
	; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 4			; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; VLEN128-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()			; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
	; VLEN128-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
	; VLEN128-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]]
	; VLEN128-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP15]], align 4
	; VLEN128-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
	; VLEN128-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
	; VLEN128-NEXT: store <vscale x 2 x i32> [[TMP16]], ptr [[TMP12]], align 4
	; VLEN128-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
	; VLEN128-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
	; VLEN128-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP19]]
	; VLEN128-NEXT: store <vscale x 2 x i32> [[TMP17]], ptr [[TMP20]], align 4
	; VLEN128-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
	; VLEN128-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
	; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
	; VLEN128-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
	; VLEN128-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
	; VLEN128: middle.block:			; VLEN128: middle.block:
	; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]			; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
	; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]			; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
	; VLEN128: scalar.ph:			; VLEN128: scalar.ph:
	; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]			; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
	; VLEN128-NEXT: br label [[FOR_BODY:%.*]]			; VLEN128-NEXT: br label [[FOR_BODY:%.*]]
	; VLEN128: for.body:			; VLEN128: for.body:
	; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]			; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
	▲ Show 20 Lines • Show All 485 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll

	; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on \			; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on \
	; RUN: -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \			; RUN: -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \
	; RUN: -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \			; RUN: -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \
	; RUN: -pass-remarks-missed=loop-vectorize -mtriple riscv64-linux-gnu \			; RUN: -pass-remarks-missed=loop-vectorize -mtriple riscv64-linux-gnu \
	; RUN: -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=CHECK			; RUN: -force-target-max-vector-interleave=2 -mattr=+v,+f -S 2>%t \
				; RUN: | FileCheck %s -check-prefix=CHECK
	; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK			; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK

	; Reduction can be vectorized			; Reduction can be vectorized

	; ADD			; ADD

	; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)			; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
	define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {			define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
	▲ Show 20 Lines • Show All 426 Lines • Show Last 20 Lines