This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
1/3
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
aarch64-matrix-umull-smull.ll

Differential D126449

[AArch64] Reuse larger DUP if available
ClosedPublic

Authored by dmgreen on May 26 2022, 12:32 AM.

Download Raw Diff

Details

Reviewers

SjoerdMeijer
samtebbs
jaykang10
sdesmalen

Commits

rG9a3144d07838: [AArch64] Reuse larger DUP if available

Summary

If both a v2i32 DUP(x) and a v4i32 DUP(x) node exist, we can re-use the larger node, using a vector extract to obtain the smaller one. This comes up in the smull/smlal code, but needs a small fixup to allow the smull2 code in tryExtendDUPToExtractHigh/performAddSubLongCombine to still match smull2 extracts.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

dmgreen created this revision. May 26 2022, 12:32 AM

Herald added a project: Restricted Project. · View Herald TranscriptMay 26 2022, 12:32 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

dmgreen requested review of this revision. May 26 2022, 12:32 AM

Herald added a project: Restricted Project. · View Herald TranscriptMay 26 2022, 12:32 AM

Harbormaster completed remote builds in B166423: Diff 432205. May 26 2022, 1:15 AM

samtebbs added inline comments. May 26 2022, 1:41 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
18245	In `tryExtendDUPToExtractHigh` above, the `EXTRACT_SUBVECTOR` gets `NumElems` as the constant, but this one gets 0. Why is it 0 in this case? Seems like an odd number to give `EXTRACT_SUBVECTOR`.

dmgreen added inline comments. May 26 2022, 2:20 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
18245	It means "extract the bottom lanes". As in - we start from lane 0. All the lanes are equal in a dup, but it is the bottom ones that are free to extract from. The NumElems in tryExtendDUPToExtractHigh is extracting the high half, because NumElems is the number of lanes in the 64bit vector. So it extracts the high 64bits from a 128bit vector and can produce a smull2 instruction as a result.

LGTM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
18245	Great explanation. Thanks.

This revision is now accepted and ready to land. May 26 2022, 2:35 AM

This revision was landed with ongoing or failed builds. May 29 2022, 11:42 AM

Closed by commit rG9a3144d07838: [AArch64] Reuse larger DUP if available (authored by dmgreen). · Explain Why

This revision was automatically updated to reflect the committed changes.

dmgreen added a commit: rG9a3144d07838: [AArch64] Reuse larger DUP if available.

dmgreen mentioned this in D128144: [AArch64] Known bits for AArch64ISD::DUP. Jun 21 2022, 12:42 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

44 lines

test/

CodeGen/

AArch64/

aarch64-matrix-umull-smull.ll

30 lines

Diff 432801

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 15,141 Lines • ▼ Show 20 Lines
//		//
// (dupv64 scalar) --> (extract_high (dup128 scalar))		// (dupv64 scalar) --> (extract_high (dup128 scalar))
//		//
// This routine does the actual conversion of such DUPs, once outer routines		// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.		// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold		// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.		// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {		static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
		MVT VT = N.getSimpleValueType();
		if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
		N.getConstantOperandVal(1) == 0)
		N = N.getOperand(0);

switch (N.getOpcode()) {		switch (N.getOpcode()) {
case AArch64ISD::DUP:		case AArch64ISD::DUP:
case AArch64ISD::DUPLANE8:		case AArch64ISD::DUPLANE8:
case AArch64ISD::DUPLANE16:		case AArch64ISD::DUPLANE16:
case AArch64ISD::DUPLANE32:		case AArch64ISD::DUPLANE32:
case AArch64ISD::DUPLANE64:		case AArch64ISD::DUPLANE64:
case AArch64ISD::MOVI:		case AArch64ISD::MOVI:
case AArch64ISD::MOVIshift:		case AArch64ISD::MOVIshift:
case AArch64ISD::MOVIedit:		case AArch64ISD::MOVIedit:
case AArch64ISD::MOVImsl:		case AArch64ISD::MOVImsl:
case AArch64ISD::MVNIshift:		case AArch64ISD::MVNIshift:
case AArch64ISD::MVNImsl:		case AArch64ISD::MVNImsl:
break;		break;
default:		default:
// FMOV could be supported, but isn't very useful, as it would only occur		// FMOV could be supported, but isn't very useful, as it would only occur
// if you passed a bitcast' floating point immediate to an eligible long		// if you passed a bitcast' floating point immediate to an eligible long
// integer op (addl, smull, ...).		// integer op (addl, smull, ...).
return SDValue();		return SDValue();
}		}

MVT NarrowTy = N.getSimpleValueType();		if (!VT.is64BitVector())
if (!NarrowTy.is64BitVector())
return SDValue();		return SDValue();

MVT ElementTy = NarrowTy.getVectorElementType();		SDLoc DL(N);
unsigned NumElems = NarrowTy.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();
		if (N.getValueType().is64BitVector()) {
		MVT ElementTy = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);		MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
		N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
		}

SDLoc dl(N);		return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,		DAG.getConstant(NumElems, DL, MVT::i64));
DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
DAG.getConstant(NumElems, dl, MVT::i64));
}		}

static bool isEssentiallyExtractHighSubvector(SDValue N) {		static bool isEssentiallyExtractHighSubvector(SDValue N) {
if (N.getOpcode() == ISD::BITCAST)		if (N.getOpcode() == ISD::BITCAST)
N = N.getOperand(0);		N = N.getOperand(0);
if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)		if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;		return false;
if (N.getOperand(0).getValueType().isScalableVector())		if (N.getOperand(0).getValueType().isScalableVector())
▲ Show 20 Lines • Show All 3,030 Lines • ▼ Show 20 Lines	static SDValue performSelectCombine(SDNode *N,
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);		SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);		SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,		Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);		ResVT.changeVectorElementTypeToInteger(), Mask);

return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));		return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}		}

		static SDValue performDUPCombine(SDNode *N,
		TargetLowering::DAGCombinerInfo &DCI) {
		EVT VT = N->getValueType(0);
		// If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
		// 128bit vector version.
		if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
		EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
		if (SDNode *LN = DCI.DAG.getNodeIfExists(
		N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
		SDLoc DL(N);
		return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
		DCI.DAG.getConstant(0, DL, MVT::i64));
		samtebbsUnsubmitted Not Done Reply Inline Actions In `tryExtendDUPToExtractHigh` above, the `EXTRACT_SUBVECTOR` gets `NumElems` as the constant, but this one gets 0. Why is it 0 in this case? Seems like an odd number to give `EXTRACT_SUBVECTOR`. samtebbs: In `tryExtendDUPToExtractHigh` above, the `EXTRACT_SUBVECTOR` gets `NumElems` as the constant…
		dmgreenAuthorUnsubmitted Done Reply Inline Actions It means "extract the bottom lanes". As in - we start from lane 0. All the lanes are equal in a dup, but it is the bottom ones that are free to extract from. The NumElems in tryExtendDUPToExtractHigh is extracting the high half, because NumElems is the number of lanes in the 64bit vector. So it extracts the high 64bits from a 128bit vector and can produce a smull2 instruction as a result. dmgreen: It means "extract the bottom lanes". As in - we start from lane 0. All the lanes are equal in a…
		samtebbsUnsubmitted Not Done Reply Inline Actions Great explanation. Thanks. samtebbs: Great explanation. Thanks.
		}
		}

		return performPostLD1Combine(N, DCI, false);
		}

/// Get rid of unnecessary NVCASTs (that don't change the type).		/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {		static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())		if (N->getValueType(0) == N->getOperand(0).getValueType())
return N->getOperand(0);		return N->getOperand(0);

return SDValue();		return SDValue();
}		}

▲ Show 20 Lines • Show All 707 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case AArch64ISD::BRCOND:		case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);		return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:		case AArch64ISD::TBNZ:
case AArch64ISD::TBZ:		case AArch64ISD::TBZ:
return performTBZCombine(N, DCI, DAG);		return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:		case AArch64ISD::CSEL:
return performCSELCombine(N, DCI, DAG);		return performCSELCombine(N, DCI, DAG);
case AArch64ISD::DUP:		case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);		return performDUPCombine(N, DCI);
case AArch64ISD::NVCAST:		case AArch64ISD::NVCAST:
return performNVCASTCombine(N);		return performNVCASTCombine(N);
case AArch64ISD::SPLICE:		case AArch64ISD::SPLICE:
return performSpliceCombine(N, DAG);		return performSpliceCombine(N, DAG);
case AArch64ISD::UUNPKLO:		case AArch64ISD::UUNPKLO:
case AArch64ISD::UUNPKHI:		case AArch64ISD::UUNPKHI:
return performUnpackCombine(N, DAG);		return performUnpackCombine(N, DAG);
case AArch64ISD::UZP1:		case AArch64ISD::UZP1:
▲ Show 20 Lines • Show All 2,289 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

	Show First 20 Lines • Show All 195 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: mov w9, w3			; CHECK-NEXT: mov w9, w3
	; CHECK-NEXT: cmp w3, #15			; CHECK-NEXT: cmp w3, #15
	; CHECK-NEXT: b.hi .LBB3_3			; CHECK-NEXT: b.hi .LBB3_3
	; CHECK-NEXT: // %bb.2:			; CHECK-NEXT: // %bb.2:
	; CHECK-NEXT: mov x10, xzr			; CHECK-NEXT: mov x10, xzr
	; CHECK-NEXT: b .LBB3_6			; CHECK-NEXT: b .LBB3_6
	; CHECK-NEXT: .LBB3_3: // %vector.ph			; CHECK-NEXT: .LBB3_3: // %vector.ph
	; CHECK-NEXT: and x10, x9, #0xfffffff0			; CHECK-NEXT: and x10, x9, #0xfffffff0
	; CHECK-NEXT: dup v0.4h, w8
	; CHECK-NEXT: add x11, x2, #32			; CHECK-NEXT: add x11, x2, #32
	; CHECK-NEXT: add x12, x0, #16			; CHECK-NEXT: add x12, x0, #16
	; CHECK-NEXT: mov x13, x10			; CHECK-NEXT: mov x13, x10
	; CHECK-NEXT: dup v1.8h, w8			; CHECK-NEXT: dup v0.8h, w8
	; CHECK-NEXT: .LBB3_4: // %vector.body			; CHECK-NEXT: .LBB3_4: // %vector.body
	; CHECK-NEXT: // =>This Inner Loop Header: Depth=1			; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: ldp q2, q3, [x12, #-16]			; CHECK-NEXT: ldp q1, q2, [x12, #-16]
	; CHECK-NEXT: subs x13, x13, #16			; CHECK-NEXT: subs x13, x13, #16
	; CHECK-NEXT: add x12, x12, #32			; CHECK-NEXT: add x12, x12, #32
	; CHECK-NEXT: smull2 v4.4s, v1.8h, v2.8h			; CHECK-NEXT: smull2 v3.4s, v0.8h, v1.8h
				; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
				; CHECK-NEXT: smull2 v4.4s, v0.8h, v2.8h
	; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h			; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h
	; CHECK-NEXT: smull2 v5.4s, v1.8h, v3.8h			; CHECK-NEXT: stp q1, q3, [x11, #-32]
	; CHECK-NEXT: smull v3.4s, v0.4h, v3.4h			; CHECK-NEXT: stp q2, q4, [x11], #64
	; CHECK-NEXT: stp q2, q4, [x11, #-32]
	; CHECK-NEXT: stp q3, q5, [x11], #64
	; CHECK-NEXT: b.ne .LBB3_4			; CHECK-NEXT: b.ne .LBB3_4
	; CHECK-NEXT: // %bb.5: // %middle.block			; CHECK-NEXT: // %bb.5: // %middle.block
	; CHECK-NEXT: cmp x10, x9			; CHECK-NEXT: cmp x10, x9
	; CHECK-NEXT: b.eq .LBB3_8			; CHECK-NEXT: b.eq .LBB3_8
	; CHECK-NEXT: .LBB3_6: // %for.body.preheader1			; CHECK-NEXT: .LBB3_6: // %for.body.preheader1
	; CHECK-NEXT: sub x9, x9, x10			; CHECK-NEXT: sub x9, x9, x10
	; CHECK-NEXT: add x11, x2, x10, lsl #2			; CHECK-NEXT: add x11, x2, x10, lsl #2
	; CHECK-NEXT: add x10, x0, x10, lsl #1			; CHECK-NEXT: add x10, x0, x10, lsl #1
	▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: mov w9, w3			; CHECK-NEXT: mov w9, w3
	; CHECK-NEXT: cmp w3, #15			; CHECK-NEXT: cmp w3, #15
	; CHECK-NEXT: b.hi .LBB4_3			; CHECK-NEXT: b.hi .LBB4_3
	; CHECK-NEXT: // %bb.2:			; CHECK-NEXT: // %bb.2:
	; CHECK-NEXT: mov x10, xzr			; CHECK-NEXT: mov x10, xzr
	; CHECK-NEXT: b .LBB4_6			; CHECK-NEXT: b .LBB4_6
	; CHECK-NEXT: .LBB4_3: // %vector.ph			; CHECK-NEXT: .LBB4_3: // %vector.ph
	; CHECK-NEXT: and x10, x9, #0xfffffff0			; CHECK-NEXT: and x10, x9, #0xfffffff0
	; CHECK-NEXT: dup v0.4h, w8
	; CHECK-NEXT: add x11, x2, #32			; CHECK-NEXT: add x11, x2, #32
	; CHECK-NEXT: add x12, x0, #16			; CHECK-NEXT: add x12, x0, #16
	; CHECK-NEXT: mov x13, x10			; CHECK-NEXT: mov x13, x10
	; CHECK-NEXT: dup v1.8h, w8			; CHECK-NEXT: dup v0.8h, w8
	; CHECK-NEXT: .LBB4_4: // %vector.body			; CHECK-NEXT: .LBB4_4: // %vector.body
	; CHECK-NEXT: // =>This Inner Loop Header: Depth=1			; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: ldp q2, q3, [x12, #-16]			; CHECK-NEXT: ldp q1, q2, [x12, #-16]
	; CHECK-NEXT: subs x13, x13, #16			; CHECK-NEXT: subs x13, x13, #16
	; CHECK-NEXT: add x12, x12, #32			; CHECK-NEXT: add x12, x12, #32
	; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h			; CHECK-NEXT: umull2 v3.4s, v0.8h, v1.8h
				; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
				; CHECK-NEXT: umull2 v4.4s, v0.8h, v2.8h
	; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h			; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
	; CHECK-NEXT: umull2 v5.4s, v1.8h, v3.8h			; CHECK-NEXT: stp q1, q3, [x11, #-32]
	; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h			; CHECK-NEXT: stp q2, q4, [x11], #64
	; CHECK-NEXT: stp q2, q4, [x11, #-32]
	; CHECK-NEXT: stp q3, q5, [x11], #64
	; CHECK-NEXT: b.ne .LBB4_4			; CHECK-NEXT: b.ne .LBB4_4
	; CHECK-NEXT: // %bb.5: // %middle.block			; CHECK-NEXT: // %bb.5: // %middle.block
	; CHECK-NEXT: cmp x10, x9			; CHECK-NEXT: cmp x10, x9
	; CHECK-NEXT: b.eq .LBB4_8			; CHECK-NEXT: b.eq .LBB4_8
	; CHECK-NEXT: .LBB4_6: // %for.body.preheader1			; CHECK-NEXT: .LBB4_6: // %for.body.preheader1
	; CHECK-NEXT: sub x9, x9, x10			; CHECK-NEXT: sub x9, x9, x10
	; CHECK-NEXT: add x11, x2, x10, lsl #2			; CHECK-NEXT: add x11, x2, x10, lsl #2
	; CHECK-NEXT: add x10, x0, x10, lsl #1			; CHECK-NEXT: add x10, x0, x10, lsl #1
	▲ Show 20 Lines • Show All 193 Lines • Show Last 20 Lines