This is an archive of the discontinued LLVM Phabricator instance.

[RISCV] Reduce VL of splat instructions used to feed stores
Needs Review · Public

Authored by reames on Nov 11 2022, 11:30 AM.

Download Raw Diff

Details

Reviewers

craig.topper
frasercrmck
asb

Summary

If we splat a value into N lanes, and then the only use of the resulting vector is a store of M lanes, we can reduce the size of the splat to min(M,N). Lanes between M and N (if M < N) can't be observed.

In principle, reducing the width of the splat could cause an additional VL toggle, but this appears not to happen much in practice.

This is a carved-down version of D130895. The intention of this patch is to provide a base onto which the remaining complexity of that patch can be added with incremental review.

Diff Detail

Unit Tests: Failed

	Time	Test
	60,090 ms	x64 debian > ThreadSanitizer-x86_64.ThreadSanitizer-x86_64::restore_stack.cpp
	60,060 ms	x64 debian > libFuzzer.libFuzzer::minimize_crash.test

Event Timeline

reames created this revision.Nov 11 2022, 11:30 AM

Herald added a project: Restricted Project. · View Herald TranscriptNov 11 2022, 11:30 AM

Herald added subscribers: sunshaoce, VincentWu, StephenFan and 29 others. · View Herald Transcript

reames requested review of this revision.Nov 11 2022, 11:30 AM

Herald added a project: Restricted Project. · View Herald TranscriptNov 11 2022, 11:30 AM

Herald added subscribers: • pcwang-thead, eopXD, MaskRay. · View Herald Transcript

reames mentioned this in D130895: [RISCV] Make VL choosing for a splat-like VMV based on its users.Nov 11 2022, 11:38 AM

Harbormaster completed remote builds in B197275: Diff 474816.Nov 11 2022, 12:31 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

RISCV/

RISCVISelDAGToDAG.h

1 line

RISCVISelDAGToDAG.cpp

123 lines

test/

CodeGen/

RISCV/

rvv/

vsetvli-insert-crossbb.ll

11 lines

Diff 474816

llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h

	Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines
	#include "RISCVGenDAGISel.inc"			#include "RISCVGenDAGISel.inc"

	private:			private:
	bool doPeepholeSExtW(SDNode *Node);			bool doPeepholeSExtW(SDNode *Node);
	bool doPeepholeMaskedRVV(SDNode *Node);			bool doPeepholeMaskedRVV(SDNode *Node);
	bool doPeepholeMergeVVMFold();			bool doPeepholeMergeVVMFold();
	bool performVMergeToVAdd(SDNode *N);			bool performVMergeToVAdd(SDNode *N);
	bool performCombineVMergeAndVOps(SDNode *N, bool IsTA);			bool performCombineVMergeAndVOps(SDNode *N, bool IsTA);
				/// Tries to shrink the VL operand of a splat-like VMV/VFMV node.
				/// Returns \p Node itself when nothing was changed.
				SDNode *tryShrinkVLForVMV(SDNode *Node);
	};			};

	namespace RISCV {			namespace RISCV {
	struct VLSEGPseudo {			struct VLSEGPseudo {
	uint16_t NF : 4;			uint16_t NF : 4;
	uint16_t Masked : 1;			uint16_t Masked : 1;
	uint16_t IsTU : 1;			uint16_t IsTU : 1;
	uint16_t Strided : 1;			uint16_t Strided : 1;
	▲ Show 20 Lines • Show All 86 Lines • Show Last 20 Lines

llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp

Show First 20 Lines • Show All 50 Lines • ▼ Show 20 Lines
}		}

static unsigned getVecPolicyOpIdx(const SDNode *Node, const MCInstrDesc &MCID) {		static unsigned getVecPolicyOpIdx(const SDNode *Node, const MCInstrDesc &MCID) {
assert(RISCVII::hasVecPolicyOp(MCID.TSFlags));		assert(RISCVII::hasVecPolicyOp(MCID.TSFlags));
(void)MCID;		(void)MCID;
return getLastNonGlueOrChainOpIdx(Node);		return getLastNonGlueOrChainOpIdx(Node);
}		}

		/// Returns the index of the SEW operand of \p Node. The SEW operand is the
		/// last non-glue/non-chain operand, unless the instruction also carries a
		/// vector policy operand, in which case SEW is the one just before it.
		static unsigned getSEWOpIdx(const SDNode *Node, const MCInstrDesc &MCID) {
		  assert(RISCVII::hasSEWOp(MCID.TSFlags));
		  const unsigned LastIdx = getLastNonGlueOrChainOpIdx(Node);
		  return RISCVII::hasVecPolicyOp(MCID.TSFlags) ? LastIdx - 1 : LastIdx;
		}

		/// Returns the index of the VL operand of \p Node. Requires that the
		/// instruction has both a VL and a SEW operand.
		static unsigned getVLOpIdx(const SDNode *Node, const MCInstrDesc &MCID) {
		  assert(RISCVII::hasVLOp(MCID.TSFlags) && RISCVII::hasSEWOp(MCID.TSFlags));
		  // The VL operand sits immediately before the SEW operand.
		  const unsigned SEWIdx = getSEWOpIdx(Node, MCID);
		  return SEWIdx - 1;
		}

		/// Returns the SEW encoded by the SEW operand of \p Node. The operand
		/// holds log2(SEW); a stored value of 0 is reported as SEW = 8.
		static unsigned getSEWOp(const SDNode *Node, const MCInstrDesc &MCID) {
		  assert(RISCVII::hasSEWOp(MCID.TSFlags));
		  const unsigned Log2SEW =
		      Node->getConstantOperandVal(getSEWOpIdx(Node, MCID));
		  // NOTE(review): Log2SEW == 0 is presumably the encoding used by mask
		  // instructions - confirm against the pseudo definitions.
		  if (Log2SEW == 0)
		    return 8;
		  return 1 << Log2SEW;
		}

		/// Returns the VL operand of \p Node, which must have a VL operand.
		static SDValue getVLOperand(const SDNode *Node, const MCInstrDesc &MCID) {
		  assert(RISCVII::hasVLOp(MCID.TSFlags));
		  const unsigned VLIdx = getVLOpIdx(Node, MCID);
		  return Node->getOperand(VLIdx);
		}

void RISCVDAGToDAGISel::PreprocessISelDAG() {		void RISCVDAGToDAGISel::PreprocessISelDAG() {
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();		SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

bool MadeChange = false;		bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {		while (Position != CurDAG->allnodes_begin()) {
SDNode N = &--Position;		SDNode N = &--Position;
if (N->use_empty())		if (N->use_empty())
continue;		continue;
▲ Show 20 Lines • Show All 1,714 Lines • ▼ Show 20 Lines	if (SubRegIdx == RISCV::NoSubRegister) {
ReplaceNode(Node, NewNode);		ReplaceNode(Node, NewNode);
return;		return;
}		}

SDValue Extract = CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, V);		SDValue Extract = CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, V);
ReplaceNode(Node, Extract.getNode());		ReplaceNode(Node, Extract.getNode());
return;		return;
}		}
case RISCVISD::VMV_S_X_VL:
case RISCVISD::VFMV_S_F_VL:
case RISCVISD::VMV_V_X_VL:		case RISCVISD::VMV_V_X_VL:
case RISCVISD::VFMV_V_F_VL: {		case RISCVISD::VFMV_V_F_VL: {
		// Try to shrink VL for a splat-like move.
		SDNode *UpdatedNode = tryShrinkVLForVMV(Node);
		if (UpdatedNode != Node) {
		ReplaceNode(Node, UpdatedNode);
		return;
		}
		[[fallthrough]];
		}
		case RISCVISD::VMV_S_X_VL:
		case RISCVISD::VFMV_S_F_VL: {
// Try to match splat of a scalar load to a strided load with stride of x0.		// Try to match splat of a scalar load to a strided load with stride of x0.
bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL \|\|		bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL \|\|
Node->getOpcode() == RISCVISD::VFMV_S_F_VL;		Node->getOpcode() == RISCVISD::VFMV_S_F_VL;
if (!Node->getOperand(0).isUndef())		if (!Node->getOperand(0).isUndef())
break;		break;
SDValue Src = Node->getOperand(1);		SDValue Src = Node->getOperand(1);
auto *Ld = dyn_cast<LoadSDNode>(Src);		auto *Ld = dyn_cast<LoadSDNode>(Src);
if (!Ld)		if (!Ld)
▲ Show 20 Lines • Show All 670 Lines • ▼ Show 20 Lines	if (auto *C = dyn_cast<ConstantSDNode>(N)) {

Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), Subtarget->getXLenVT());		Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), Subtarget->getXLenVT());
return true;		return true;
}		}

return false;		return false;
}		}


		/// Returns true if \p VL denotes VLMAX: either a constant equal to
		/// RISCV::VLMaxSentinel or the register X0.
		static bool isVLMax(SDValue VL) {
		  if (const auto *ConstVL = dyn_cast<ConstantSDNode>(VL))
		    return ConstVL->getSExtValue() == RISCV::VLMaxSentinel;
		  if (const auto *RegVL = dyn_cast<RegisterSDNode>(VL))
		    return RegVL->getReg() == RISCV::X0;
		  return false;
		}

		/// Returns true if \p VL1 can be treated as smaller than \p VL2.
		///
		/// - If \p VL1 is VLMAX, nothing is smaller, so the result is false.
		/// - Otherwise, if \p VL2 is VLMAX, the result is true.
		/// - If both are constants, they are compared by value.
		/// - Any other combination (a non-X0 register on either side) cannot be
		///   compared and conservatively yields false.
		static bool isVLLessThan(SDValue VL1, SDValue VL2) {
		  assert(VL1 && VL2);
		  if (isVLMax(VL1))
		    return false;
		  if (isVLMax(VL2))
		    return true;
		  auto *ConstantVL1 = dyn_cast<ConstantSDNode>(VL1);
		  auto *ConstantVL2 = dyn_cast<ConstantSDNode>(VL2);
		  if (!ConstantVL1 || !ConstantVL2)
		    // Cannot compare reg-reg/constant-reg/reg-constant cases apart from X0
		    // and VLMaxSentinel that are handled above.
		    return false;
		  return ConstantVL1->getSExtValue() < ConstantVL2->getSExtValue();
		}

		/// Returns true if the user instruction has a VL operand, and is
		/// known to demand only that number of lanes from this input use.
		static bool allowsVLShrinking(const SDUse &Use) {
		const SDNode *User = Use.getUser();
		if (!User->isMachineOpcode())
		return false;

		// A VSE instruction doesn't have a merge operand, and doesn't
		// read past VL at all. That makes it a simple case to start with.
		const RISCVVPseudosTable::PseudoInfo *RVV =
		RISCVVPseudosTable::getPseudoInfo(User->getMachineOpcode());
		if (!RVV)
		return false;
		switch (RVV->BaseInstr) {
		default:
		return false;
		case RISCV::VSE8_V:
		case RISCV::VSE16_V:
		case RISCV::VSE32_V:
		case RISCV::VSE64_V:
		return true;
		}
		}

		// Analyzes users of a splat-like VMV/VFMV instruction and chooses the minimal
		// possible VL.
		//
		// Returns Node unchanged when the VL cannot be shrunk: the node has more
		// than one use, its single user does not allow VL shrinking, the user's SEW
		// or LMUL differs from the node's, or the user's VL is not known to be
		// smaller than the node's current VL. Otherwise returns the result of
		// updating the node's last operand (its VL) to the user's VL.
		SDNode *RISCVDAGToDAGISel::tryShrinkVLForVMV(SDNode *Node) {
		  const RISCVInstrInfo &TII = *Subtarget->getInstrInfo();

		  // FIXME: this can be profitable for the moves with multiple uses as well.
		  if (!Node->hasOneUse())
		    return Node;
		  const SDNode::use_iterator UI = Node->use_begin();
		  if (!allowsVLShrinking(UI.getUse()))
		    return Node;

		  const SDNode *User = *UI;
		  const MCInstrDesc &UserMCID = TII.get(User->getMachineOpcode());

		  // If SEW or LMUL differs, then VL values may not be comparable
		  MVT VT = Node->getSimpleValueType(0);
		  const unsigned SEW = VT.getScalarSizeInBits();
		  const unsigned UserSEW = getSEWOp(User, UserMCID);
		  if (SEW != UserSEW)
		    return Node;

		  RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
		  RISCVII::VLMUL UserLMUL = RISCVII::getLMul(UserMCID.TSFlags);
		  if (LMUL != UserLMUL)
		    return Node;

		  // The node's VL is its last operand; only shrink when the user's VL is
		  // known to be smaller.
		  SDValue VL = getVLOperand(User, UserMCID);
		  SDValue OldVL = Node->getOperand(Node->getNumOperands() - 1);
		  if (!isVLLessThan(VL, OldVL))
		    return Node;

		  // MergeOp, Src, VL.
		  SmallVector<SDValue, 3> Ops(Node->op_begin(), Node->op_end());
		  Ops.back() = VL;
		  return CurDAG->UpdateNodeOperands(Node, Ops);
		}

// Try to remove sext.w if the input is a W instruction or can be made into		// Try to remove sext.w if the input is a W instruction or can be made into
// a W instruction cheaply.		// a W instruction cheaply.
bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {		bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
// Look for the sext.w pattern, addiw rd, rs1, 0.		// Look for the sext.w pattern, addiw rd, rs1, 0.
if (N->getMachineOpcode() != RISCV::ADDIW \|\|		if (N->getMachineOpcode() != RISCV::ADDIW \|\|
!isNullConstant(N->getOperand(1)))		!isNullConstant(N->getOperand(1)))
return false;		return false;

▲ Show 20 Lines • Show All 360 Lines • Show Last 20 Lines

llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll

	Show First 20 Lines • Show All 675 Lines • ▼ Show 20 Lines
	define void @vector_init_vsetvli_N(i64 %N, double* %c) {			define void @vector_init_vsetvli_N(i64 %N, double* %c) {
	; CHECK-LABEL: vector_init_vsetvli_N:			; CHECK-LABEL: vector_init_vsetvli_N:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vsetvli a2, a0, e64, m1, ta, mu			; CHECK-NEXT: vsetvli a2, a0, e64, m1, ta, mu
	; CHECK-NEXT: blez a0, .LBB13_3			; CHECK-NEXT: blez a0, .LBB13_3
	; CHECK-NEXT: # %bb.1: # %for.body.preheader			; CHECK-NEXT: # %bb.1: # %for.body.preheader
	; CHECK-NEXT: li a3, 0			; CHECK-NEXT: li a3, 0
	; CHECK-NEXT: slli a4, a2, 3			; CHECK-NEXT: slli a4, a2, 3
	; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma
	; CHECK-NEXT: vmv.v.i v8, 0			; CHECK-NEXT: vmv.v.i v8, 0
	; CHECK-NEXT: .LBB13_2: # %for.body			; CHECK-NEXT: .LBB13_2: # %for.body
	; CHECK-NEXT: # =>This Inner Loop Header: Depth=1			; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma			; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
	; CHECK-NEXT: vse64.v v8, (a1)			; CHECK-NEXT: vse64.v v8, (a1)
	; CHECK-NEXT: add a3, a3, a2			; CHECK-NEXT: add a3, a3, a2
	; CHECK-NEXT: add a1, a1, a4			; CHECK-NEXT: add a1, a1, a4
	; CHECK-NEXT: blt a3, a0, .LBB13_2			; CHECK-NEXT: blt a3, a0, .LBB13_2
	Show All 19 Lines

	; Same as above, but VL is a hard coded constant (in the preheader)			; Same as above, but VL is a hard coded constant (in the preheader)
	define void @vector_init_vsetvli_fv(i64 %N, double* %c) {			define void @vector_init_vsetvli_fv(i64 %N, double* %c) {
	; CHECK-LABEL: vector_init_vsetvli_fv:			; CHECK-LABEL: vector_init_vsetvli_fv:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: li a2, 0			; CHECK-NEXT: li a2, 0
	; CHECK-NEXT: vsetivli a3, 4, e64, m1, ta, mu			; CHECK-NEXT: vsetivli a3, 4, e64, m1, ta, mu
	; CHECK-NEXT: slli a4, a3, 3			; CHECK-NEXT: slli a4, a3, 3
	; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma
	; CHECK-NEXT: vmv.v.i v8, 0			; CHECK-NEXT: vmv.v.i v8, 0
	; CHECK-NEXT: .LBB14_1: # %for.body			; CHECK-NEXT: .LBB14_1: # %for.body
	; CHECK-NEXT: # =>This Inner Loop Header: Depth=1			; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma			; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
	; CHECK-NEXT: vse64.v v8, (a1)			; CHECK-NEXT: vse64.v v8, (a1)
	; CHECK-NEXT: add a2, a2, a3			; CHECK-NEXT: add a2, a2, a3
	; CHECK-NEXT: add a1, a1, a4			; CHECK-NEXT: add a1, a1, a4
	; CHECK-NEXT: blt a2, a0, .LBB14_1			; CHECK-NEXT: blt a2, a0, .LBB14_1
	; CHECK-NEXT: # %bb.2: # %for.end			; CHECK-NEXT: # %bb.2: # %for.end
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%0 = tail call i64 @llvm.riscv.vsetvli(i64 4, i64 3, i64 0)			%0 = tail call i64 @llvm.riscv.vsetvli(i64 4, i64 3, i64 0)
	Show All 13 Lines
	}			}

	; Same as above, but result of vsetvli in preheader isn't used, and			; Same as above, but result of vsetvli in preheader isn't used, and
	; constant is repeated in loop			; constant is repeated in loop
	define void @vector_init_vsetvli_fv2(i64 %N, double* %c) {			define void @vector_init_vsetvli_fv2(i64 %N, double* %c) {
	; CHECK-LABEL: vector_init_vsetvli_fv2:			; CHECK-LABEL: vector_init_vsetvli_fv2:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: li a2, 0			; CHECK-NEXT: li a2, 0
	; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma			; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, mu
	; CHECK-NEXT: vmv.v.i v8, 0			; CHECK-NEXT: vmv.v.i v8, 0
	; CHECK-NEXT: .LBB15_1: # %for.body			; CHECK-NEXT: .LBB15_1: # %for.body
	; CHECK-NEXT: # =>This Inner Loop Header: Depth=1			; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma			; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
	; CHECK-NEXT: vse64.v v8, (a1)			; CHECK-NEXT: vse64.v v8, (a1)
	; CHECK-NEXT: addi a2, a2, 4			; CHECK-NEXT: addi a2, a2, 4
	; CHECK-NEXT: addi a1, a1, 32			; CHECK-NEXT: addi a1, a1, 32
	; CHECK-NEXT: blt a2, a0, .LBB15_1			; CHECK-NEXT: blt a2, a0, .LBB15_1
	; CHECK-NEXT: # %bb.2: # %for.end			; CHECK-NEXT: # %bb.2: # %for.end
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	tail call i64 @llvm.riscv.vsetvli(i64 4, i64 3, i64 0)			tail call i64 @llvm.riscv.vsetvli(i64 4, i64 3, i64 0)
	Show All 13 Lines
	}			}

	; Same as above, but AVL is only specified on the store intrinsic			; Same as above, but AVL is only specified on the store intrinsic
	; This case will require some form of hoisting or PRE			; This case will require some form of hoisting or PRE
	define void @vector_init_vsetvli_fv3(i64 %N, double* %c) {			define void @vector_init_vsetvli_fv3(i64 %N, double* %c) {
	; CHECK-LABEL: vector_init_vsetvli_fv3:			; CHECK-LABEL: vector_init_vsetvli_fv3:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: li a2, 0			; CHECK-NEXT: li a2, 0
	; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma			; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma
	; CHECK-NEXT: vmv.v.i v8, 0			; CHECK-NEXT: vmv.v.i v8, 0
	; CHECK-NEXT: .LBB16_1: # %for.body			; CHECK-NEXT: .LBB16_1: # %for.body
	; CHECK-NEXT: # =>This Inner Loop Header: Depth=1			; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma
	; CHECK-NEXT: vse64.v v8, (a1)			; CHECK-NEXT: vse64.v v8, (a1)
	; CHECK-NEXT: addi a2, a2, 4			; CHECK-NEXT: addi a2, a2, 4
	; CHECK-NEXT: addi a1, a1, 32			; CHECK-NEXT: addi a1, a1, 32
	; CHECK-NEXT: blt a2, a0, .LBB16_1			; CHECK-NEXT: blt a2, a0, .LBB16_1
	; CHECK-NEXT: # %bb.2: # %for.end			; CHECK-NEXT: # %bb.2: # %for.end
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	br label %for.body			br label %for.body
	▲ Show 20 Lines • Show All 170 Lines • Show Last 20 Lines