This is an archive of the discontinued LLVM Phabricator instance.

[RISCV] Match splatted load to scalar load + splat. Form strided load during isel.
ClosedPublic

Authored by craig.topper on Apr 22 2021, 11:23 PM.

Download Raw Diff

Details

Reviewers

frasercrmck
khchen
arcbbb
HsiangKai
evandro

Commits

rGe2cd92cb9bc2: [RISCV] Match splatted load to scalar load + splat. Form strided load during…

Summary

This modifies my previous patch to push the formation of the strided load
(a load with a stride of x0) to isel. This gives us the opportunity to fold
the splat into a .vx operation first. Using a scalar register and a .vx
operation reduces vector register pressure, which can be important for
larger LMULs.

If we can't fold the splat into a .vx operation, then it can make
sense to use a strided load instead, which frees up the vector ALU
to do actual arithmetic rather than tying it up with vmv.v.x.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

craig.topper created this revision.Apr 22 2021, 11:23 PM

Herald added subscribers: StephenFan, vkmr, luismarques and 24 others. · View Herald TranscriptApr 22 2021, 11:23 PM

craig.topper requested review of this revision.Apr 22 2021, 11:23 PM

Herald added a project: Restricted Project. · View Herald TranscriptApr 22 2021, 11:23 PM

Herald added a subscriber: MaskRay. · View Herald Transcript

Harbormaster completed remote builds in B100470: Diff 339894.Apr 23 2021, 12:10 AM

LGTM.

This revision is now accepted and ready to land.Apr 26 2021, 9:29 AM

Closed by commit rGe2cd92cb9bc2: [RISCV] Match splatted load to scalar load + splat. Form strided load during… (authored by craig.topper). · Explain WhyApr 26 2021, 1:36 PM

This revision was automatically updated to reflect the committed changes.

craig.topper added a commit: rGe2cd92cb9bc2: [RISCV] Match splatted load to scalar load + splat. Form strided load during….

• pcwang-thead mentioned this in D138101: [RISCV] Lower unmasked zero-stride vector load to (scalar load + splat).Nov 16 2022, 12:30 AM

• pcwang-thead mentioned this in rG241accea2a9d: [RISCV] Lower unmasked zero-stride vector load to (scalar load + splat).Nov 23 2022, 7:12 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

RISCV/

RISCVISelDAGToDAG.cpp

38 lines

RISCVISelLowering.cpp

45 lines

test/

CodeGen/

RISCV/

rvv/

fixed-vectors-fp-buildvec.ll

14 lines

fixed-vectors-fp-shuffles.ll

30 lines

fixed-vectors-insert.ll

8 lines

Diff 340629

llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp

Show First 20 Lines • Show All 1,132 Lines • ▼ Show 20 Lines	if (SubRegIdx == RISCV::NoSubRegister) {
ReplaceNode(Node, NewNode);		ReplaceNode(Node, NewNode);
return;		return;
}		}

SDValue Extract = CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, V);		SDValue Extract = CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, V);
ReplaceNode(Node, Extract.getNode());		ReplaceNode(Node, Extract.getNode());
return;		return;
}		}
		case RISCVISD::VMV_V_X_VL:
		case RISCVISD::VFMV_V_F_VL: {
		// Try to match splat of a scalar load to a strided load with stride of x0.
		SDValue Src = Node->getOperand(0);
		auto *Ld = dyn_cast<LoadSDNode>(Src);
		if (!Ld)
		break;
		EVT MemVT = Ld->getMemoryVT();
		// The memory VT should be the same size as the element type.
		if (MemVT.getStoreSize() != VT.getVectorElementType().getStoreSize())
		break;
		if (!IsProfitableToFold(Src, Node, Node) ||
		!IsLegalToFold(Src, Node, Node, TM.getOptLevel()))
		break;

		SDValue VL;
		selectVLOp(Node->getOperand(1), VL);

		unsigned ScalarSize = VT.getScalarSizeInBits();
		SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);

		SDValue Operands[] = {Ld->getBasePtr(),
		CurDAG->getRegister(RISCV::X0, XLenVT), VL, SEW,
		Ld->getChain()};

		RISCVVLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
		const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
		/*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, ScalarSize,
		static_cast<unsigned>(LMUL));
		MachineSDNode *Load =
		CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);

		if (auto *MemOp = dyn_cast<MemSDNode>(Node))
		CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});

		ReplaceNode(Node, Load);
		return;
		}
}		}

// Select the default instruction.		// Select the default instruction.
SelectCode(Node);		SelectCode(Node);
}		}

bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(		bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {		const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
▲ Show 20 Lines • Show All 402 Lines • Show Last 20 Lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,626 Lines • ▼ Show 20 Lines	if (Lane >= 0) {

// We need to ensure the load isn't atomic or volatile.		// We need to ensure the load isn't atomic or volatile.
if (ISD::isNormalLoad(V.getNode()) && cast<LoadSDNode>(V)->isSimple()) {		if (ISD::isNormalLoad(V.getNode()) && cast<LoadSDNode>(V)->isSimple()) {
auto *Ld = cast<LoadSDNode>(V);		auto *Ld = cast<LoadSDNode>(V);
Offset *= SVT.getStoreSize();		Offset *= SVT.getStoreSize();
SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),		SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
TypeSize::Fixed(Offset), DL);		TypeSize::Fixed(Offset), DL);

		// If this is SEW=64 on RV32, use a strided load with a stride of x0.
		if (SVT.isInteger() && SVT.bitsGT(XLenVT)) {
SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});		SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
SDValue IntID =		SDValue IntID =
DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);		DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,		SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
DAG.getRegister(RISCV::X0, XLenVT), VL};		DAG.getRegister(RISCV::X0, XLenVT), VL};
SDValue NewLoad = DAG.getMemIntrinsicNode(		SDValue NewLoad = DAG.getMemIntrinsicNode(
ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,		ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
DAG.getMachineFunction().getMachineMemOperand(		DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));		Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);		DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);		return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
}		}

		// Otherwise use a scalar load and splat. This will give the best
		// opportunity to fold a splat into the operation. ISel can turn it into
		// the x0 strided load if we aren't able to fold away the splat.
		if (SVT.isFloatingPoint())
		V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
		Ld->getPointerInfo().getWithOffset(Offset),
		Ld->getOriginalAlign(),
		Ld->getMemOperand()->getFlags());
		else
		V = DAG.getExtLoad(ISD::SEXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr,
		Ld->getPointerInfo().getWithOffset(Offset), SVT,
		Ld->getOriginalAlign(),
		Ld->getMemOperand()->getFlags());
		DAG.makeEquivalentMemoryOrdering(Ld, V);

		unsigned Opc =
		VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
		SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL);
		return convertFromScalableVector(VT, Splat, DAG, Subtarget);
		}

V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);		V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
assert(Lane < (int)NumElts && "Unexpected lane!");		assert(Lane < (int)NumElts && "Unexpected lane!");
SDValue Gather =		SDValue Gather =
DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1,		DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1,
DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL);		DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL);
return convertFromScalableVector(VT, Gather, DAG, Subtarget);		return convertFromScalableVector(VT, Gather, DAG, Subtarget);
}		}
}		}
▲ Show 20 Lines • Show All 6,600 Lines • Show Last 20 Lines

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

	Show All 17 Lines
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	store <4 x float> <float 0.0, float 4.0, float 0.0, float 2.0>, <4 x float>* %x			store <4 x float> <float 0.0, float 4.0, float 0.0, float 2.0>, <4 x float>* %x
	ret void			ret void
	}			}

	define void @buildvec_dominant0_v4f32(<4 x float>* %x) {			define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
	; CHECK-LABEL: buildvec_dominant0_v4f32:			; CHECK-LABEL: buildvec_dominant0_v4f32:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: lui a1, %hi(.LCPI1_0)
	; CHECK-NEXT: flw ft0, %lo(.LCPI1_0)(a1)
	; CHECK-NEXT: fmv.w.x ft1, zero
	; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu			; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
	; CHECK-NEXT: vfmv.s.f v25, ft1			; CHECK-NEXT: lui a1, %hi(.LCPI1_0)
	; CHECK-NEXT: vfmv.v.f v26, ft0			; CHECK-NEXT: addi a1, a1, %lo(.LCPI1_0)
				; CHECK-NEXT: vlse32.v v25, (a1), zero
				; CHECK-NEXT: fmv.w.x ft0, zero
				; CHECK-NEXT: vfmv.s.f v26, ft0
	; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu			; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu
	; CHECK-NEXT: vslideup.vi v26, v25, 2			; CHECK-NEXT: vslideup.vi v25, v26, 2
	; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu			; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
	; CHECK-NEXT: vse32.v v26, (a0)			; CHECK-NEXT: vse32.v v25, (a0)
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	store <4 x float> <float 2.0, float 2.0, float 0.0, float 2.0>, <4 x float>* %x			store <4 x float> <float 2.0, float 2.0, float 0.0, float 2.0>, <4 x float>* %x
	ret void			ret void
	}			}

	define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) {			define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) {
	; CHECK-LABEL: buildvec_dominant1_v4f32:			; CHECK-LABEL: buildvec_dominant1_v4f32:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

Show First 20 Lines • Show All 153 Lines • ▼ Show 20 Lines	; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>		%s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
ret <4 x double> %s		ret <4 x double> %s
}		}

define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {		define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
; RV32-LABEL: vrgather_shuffle_xv_v4f64:		; RV32-LABEL: vrgather_shuffle_xv_v4f64:
; RV32: # %bb.0:		; RV32: # %bb.0:
; RV32-NEXT: addi a0, zero, 12		; RV32-NEXT: addi a0, zero, 12
; RV32-NEXT: lui a1, %hi(.LCPI7_0)
; RV32-NEXT: fld ft0, %lo(.LCPI7_0)(a1)
; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu		; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
; RV32-NEXT: vmv.s.x v0, a0		; RV32-NEXT: vmv.s.x v0, a0
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu		; RV32-NEXT: lui a0, %hi(.LCPI7_0)
; RV32-NEXT: vfmv.v.f v26, ft0		; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0)
		; RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu
		; RV32-NEXT: vlse64.v v26, (a0), zero
; RV32-NEXT: lui a0, %hi(.LCPI7_1)		; RV32-NEXT: lui a0, %hi(.LCPI7_1)
; RV32-NEXT: addi a0, a0, %lo(.LCPI7_1)		; RV32-NEXT: addi a0, a0, %lo(.LCPI7_1)
; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu		; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
; RV32-NEXT: vle16.v v25, (a0)		; RV32-NEXT: vle16.v v25, (a0)
; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu		; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV32-NEXT: vrgatherei16.vv v26, v8, v25, v0.t		; RV32-NEXT: vrgatherei16.vv v26, v8, v25, v0.t
; RV32-NEXT: vmv2r.v v8, v26		; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret		; RV32-NEXT: ret
;		;
; RV64-LABEL: vrgather_shuffle_xv_v4f64:		; RV64-LABEL: vrgather_shuffle_xv_v4f64:
; RV64: # %bb.0:		; RV64: # %bb.0:
; RV64-NEXT: addi a0, zero, 12		; RV64-NEXT: addi a0, zero, 12
; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu		; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
; RV64-NEXT: vmv.s.x v0, a0		; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: lui a0, %hi(.LCPI7_0)		; RV64-NEXT: lui a0, %hi(.LCPI7_0)
; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0)		; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0)
; RV64-NEXT: lui a1, %hi(.LCPI7_1)
; RV64-NEXT: fld ft0, %lo(.LCPI7_1)(a1)
; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu		; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
; RV64-NEXT: vle64.v v28, (a0)		; RV64-NEXT: vle64.v v28, (a0)
; RV64-NEXT: vfmv.v.f v26, ft0		; RV64-NEXT: lui a0, %hi(.LCPI7_1)
		; RV64-NEXT: addi a0, a0, %lo(.LCPI7_1)
		; RV64-NEXT: vlse64.v v26, (a0), zero
; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu		; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t		; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t
; RV64-NEXT: vmv2r.v v8, v26		; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret		; RV64-NEXT: ret
%s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>		%s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x double> %s		ret <4 x double> %s
}		}

define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {		define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
; RV32-LABEL: vrgather_shuffle_vx_v4f64:		; RV32-LABEL: vrgather_shuffle_vx_v4f64:
; RV32: # %bb.0:		; RV32: # %bb.0:
; RV32-NEXT: addi a0, zero, 3		; RV32-NEXT: addi a0, zero, 3
; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu		; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
; RV32-NEXT: vmv.s.x v0, a0		; RV32-NEXT: vmv.s.x v0, a0
; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu		; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
; RV32-NEXT: vmv.s.x v25, a0		; RV32-NEXT: vmv.s.x v25, a0
; RV32-NEXT: vmv.v.i v28, 0		; RV32-NEXT: vmv.v.i v28, 0
; RV32-NEXT: lui a0, %hi(.LCPI8_0)
; RV32-NEXT: fld ft0, %lo(.LCPI8_0)(a0)
; RV32-NEXT: vsetivli a0, 2, e16,m1,tu,mu		; RV32-NEXT: vsetivli a0, 2, e16,m1,tu,mu
; RV32-NEXT: vslideup.vi v28, v25, 1		; RV32-NEXT: vslideup.vi v28, v25, 1
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu		; RV32-NEXT: lui a0, %hi(.LCPI8_0)
; RV32-NEXT: vfmv.v.f v26, ft0		; RV32-NEXT: addi a0, a0, %lo(.LCPI8_0)
		; RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu
		; RV32-NEXT: vlse64.v v26, (a0), zero
; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu		; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV32-NEXT: vrgatherei16.vv v26, v8, v28, v0.t		; RV32-NEXT: vrgatherei16.vv v26, v8, v28, v0.t
; RV32-NEXT: vmv2r.v v8, v26		; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret		; RV32-NEXT: ret
;		;
; RV64-LABEL: vrgather_shuffle_vx_v4f64:		; RV64-LABEL: vrgather_shuffle_vx_v4f64:
; RV64: # %bb.0:		; RV64: # %bb.0:
; RV64-NEXT: addi a0, zero, 3		; RV64-NEXT: addi a0, zero, 3
; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu		; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
; RV64-NEXT: vmv.s.x v26, a0		; RV64-NEXT: vmv.s.x v26, a0
; RV64-NEXT: vmv.v.i v28, 0		; RV64-NEXT: vmv.v.i v28, 0
; RV64-NEXT: vsetivli a1, 2, e64,m2,tu,mu		; RV64-NEXT: vsetivli a1, 2, e64,m2,tu,mu
; RV64-NEXT: vslideup.vi v28, v26, 1		; RV64-NEXT: vslideup.vi v28, v26, 1
; RV64-NEXT: lui a1, %hi(.LCPI8_0)
; RV64-NEXT: fld ft0, %lo(.LCPI8_0)(a1)
; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu		; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
; RV64-NEXT: vmv.s.x v0, a0		; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu		; RV64-NEXT: lui a0, %hi(.LCPI8_0)
; RV64-NEXT: vfmv.v.f v26, ft0		; RV64-NEXT: addi a0, a0, %lo(.LCPI8_0)
		; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
		; RV64-NEXT: vlse64.v v26, (a0), zero
; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu		; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t		; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t
; RV64-NEXT: vmv2r.v v8, v26		; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret		; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>		%s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x double> %s		ret <4 x double> %s
}		}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll

	Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; RV32-LABEL: insertelt_v3i64:			; RV32-LABEL: insertelt_v3i64:
	; RV32: # %bb.0:			; RV32: # %bb.0:
	; RV32-NEXT: vsetivli a3, 2, e64,m1,ta,mu			; RV32-NEXT: vsetivli a3, 2, e64,m1,ta,mu
	; RV32-NEXT: vle64.v v26, (a0)			; RV32-NEXT: vle64.v v26, (a0)
	; RV32-NEXT: vsetivli a3, 8, e32,m2,ta,mu			; RV32-NEXT: vsetivli a3, 8, e32,m2,ta,mu
	; RV32-NEXT: vmv.v.i v28, 0			; RV32-NEXT: vmv.v.i v28, 0
	; RV32-NEXT: vsetivli a3, 2, e64,m2,tu,mu			; RV32-NEXT: vsetivli a3, 2, e64,m2,tu,mu
	; RV32-NEXT: vslideup.vi v28, v26, 0			; RV32-NEXT: vslideup.vi v28, v26, 0
	; RV32-NEXT: lw a3, 20(a0)			; RV32-NEXT: addi a3, a0, 20
	; RV32-NEXT: vsetivli a4, 4, e32,m1,ta,mu			; RV32-NEXT: vsetivli a4, 4, e32,m1,ta,mu
	; RV32-NEXT: lw a4, 16(a0)			; RV32-NEXT: vlse32.v v26, (a3), zero
	; RV32-NEXT: vmv.v.x v26, a3			; RV32-NEXT: lw a3, 16(a0)
	; RV32-NEXT: vmv.s.x v26, a4			; RV32-NEXT: vmv.s.x v26, a3
	; RV32-NEXT: vsetivli a3, 4, e64,m2,tu,mu			; RV32-NEXT: vsetivli a3, 4, e64,m2,tu,mu
	; RV32-NEXT: vslideup.vi v28, v26, 2			; RV32-NEXT: vslideup.vi v28, v26, 2
	; RV32-NEXT: vsetivli a3, 2, e32,m2,ta,mu			; RV32-NEXT: vsetivli a3, 2, e32,m2,ta,mu
	; RV32-NEXT: vmv.v.i v26, 0			; RV32-NEXT: vmv.v.i v26, 0
	; RV32-NEXT: vslide1up.vx v30, v26, a2			; RV32-NEXT: vslide1up.vx v30, v26, a2
	; RV32-NEXT: vslide1up.vx v26, v30, a1			; RV32-NEXT: vslide1up.vx v26, v30, a1
	; RV32-NEXT: vsetivli a3, 3, e64,m2,tu,mu			; RV32-NEXT: vsetivli a3, 3, e64,m2,tu,mu
	; RV32-NEXT: vslideup.vi v28, v26, 2			; RV32-NEXT: vslideup.vi v28, v26, 2
	▲ Show 20 Lines • Show All 253 Lines • Show Last 20 Lines