This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Handle VECTOR_SHUFFLE mask with splats
AbandonedPublic

Authored by jaykang10 on May 2 2023, 1:57 AM.

Download Raw Diff

Details

Reviewers

efriedma
dmgreen
t.p.northover

Summary

As discussed on https://reviews.llvm.org/D148347, we could handle the vector shuffle mask with splats more efficiently with dup.

Diff Detail

Event Timeline

jaykang10 created this revision.May 2 2023, 1:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptMay 2 2023, 1:57 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

jaykang10 requested review of this revision.May 2 2023, 1:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptMay 2 2023, 1:57 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B229381: Diff 518651.May 2 2023, 2:13 AM

When I was mentioning the testcases in D149638, I was thinking of cases like test11 in build-vector-two-dup.ll. For values already in vector registers, replacing a tbl with three shuffle instructions probably isn't an improvement (particularly on newer cores where tbl is fast).

In D149638#4317332, @efriedma wrote:

When I was mentioning the testcases in D149638, I was thinking of cases like test11 in build-vector-two-dup.ll. For values already in vector registers, replacing a tbl with three shuffle instructions probably isn't an improvement (particularly on newer cores where tbl is fast).

Thanks for the kind comment.
From the diff of the test output, I was not sure this transformation is useful, even though it avoids the constant pool... As you mentioned, in a loop the constant pool load could be hoisted...
Let me close this patch.

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

135 lines

test/

CodeGen/

AArch64/

arm64-swizzle-tbl-i16-layout.ll

34 lines

shuffles.ll

32 lines

Diff 518651

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 11,507 Lines • ▼ Show 20 Lines	static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
SDValue ID =		SDValue ID =
DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);		DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);

return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,		return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
{ID, Tbl1->getOperand(1), Tbl1->getOperand(2),		{ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});		Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
}		}

		// Try to lower a VECTOR_SHUFFLE whose mask is built from splats without
		// falling back to a TBL. Two strategies are attempted, in this order:
		//   1. The mask uses exactly two distinct lanes and the last run of each
		//      is NumElts / 2 long (i.e. two half-width splats): emit two DUPLANE
		//      nodes of the half-width type and join them with CONCAT_VECTORS.
		//   2. One lane fills more than half of the mask: emit a DUP of that lane
		//      and overwrite every other position with an EXTRACT_VECTOR_ELT /
		//      INSERT_VECTOR_ELT pair.
		// Returns an empty SDValue when the mask contains an undef element or when
		// neither strategy applies, so the caller can continue with its fallback.
		static SDValue tryHandleMaskWithSplats(SDValue Op, SelectionDAG &DAG) {
		SDLoc dl(Op);
		EVT VT = Op.getValueType();
		SDValue V0 = Op.getOperand(0);
		SDValue V1 = Op.getOperand(1);
		int NumElts = VT.getVectorNumElements();
		ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

		// DifferentLaneMap maps each lane number that appears in the mask to the
		// length of a run of consecutive identical mask elements.
		SmallMapVector<int, int, 16> DifferentLaneMap;
		int ConsecutiveLaneCount = 0;
		int PrevLane = -1;
		for (int i = 0; i < NumElts; ++i) {
		int Lane = Mask[i];
		// A lane change starts a new run, so restart the run length.
		if (PrevLane != Lane) {
		ConsecutiveLaneCount = 0;
		PrevLane = Lane;
		}

		// Any undef mask element makes the whole function bail out.
		if (Lane == UndefMaskElem)
		return SDValue();

		// Keep different lane and its consecutive count. The assignment
		// overwrites any earlier entry for this lane, so after the loop each entry
		// holds the length of the lane's last run only.
		DifferentLaneMap[Lane] = ++ConsecutiveLaneCount;
		}

		int NumDifferentLanes = DifferentLaneMap.size();
		if (NumDifferentLanes == 2) {
		// For each of the two lanes: <source vector, lane index within it>.
		SmallVector<std::pair<SDValue, SDValue>, 2> Lanes;
		bool canUseVECTOR_CONCAT = true;
		for (auto Pair : DifferentLaneMap) {
		// Check all different lanes have same length.
		if (Pair.second != NumElts / NumDifferentLanes)
		canUseVECTOR_CONCAT = false;
		// Keep source vector and its lane. Mask values below NumElts select from
		// the first operand; the rest select from the second operand after
		// subtracting NumElts.
		SDValue SrcVec = Pair.first < NumElts ? V0 : V1;
		int Lane = Pair.first < NumElts ? Pair.first : Pair.first - NumElts;
		Lanes.push_back(
		std::make_pair(SrcVec, DAG.getConstant(Lane, dl, MVT::i64)));
		}

		// If the mask consists of two splats which have same length, try to
		// generate DUPs and concat_vectors. For example,
		//
		// t2: v8i16,ch = CopyFromReg t0, Register:v8i16 %0
		// t4: v8i16,ch = CopyFromReg t0, Register:v8i16 %1
		// t5: v8i16 = vector_shuffle<0,0,0,0,8,8,8,8> t2, t4
		// ==>
		// t2: v8i16,ch = CopyFromReg t0, Register:v8i16 %0
		// t12: v4i16 = AArch64ISD::DUPLANE16 t2, Constant:i64<0>
		// t4: v8i16,ch = CopyFromReg t0, Register:v8i16 %1
		// t13: v4i16 = AArch64ISD::DUPLANE16 t4, Constant:i64<0>
		// t14: v8i16 = concat_vectors t12, t13
		if (canUseVECTOR_CONCAT) {
		EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
		// Only proceed when the half-width type is a legal vector with at least
		// four elements.
		if (DAG.getTargetLoweringInfo().isTypeLegal(SubVT) && SubVT.isVector() &&
		SubVT.getVectorNumElements() >= 4) {
		unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
		// Lanes[0] / Lanes[1] follow the map's iteration order.
		// NOTE(review): this assumes SmallMapVector iterates in insertion order,
		// so Lanes[0] is the splat for the low half - confirm.
		SDValue DUP1 =
		DAG.getNode(Opcode, dl, SubVT, Lanes[0].first, Lanes[0].second);
		SDValue DUP2 =
		DAG.getNode(Opcode, dl, SubVT, Lanes[1].first, Lanes[1].second);
		SDValue CONCAT_VECTORS =
		DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
		return CONCAT_VECTORS;
		}
		}
		}

		// From here, DifferentLaneMap keeps <lane number, lane count> on mask,
		// i.e. the total number of mask positions that use each lane.
		DifferentLaneMap.clear();

		for (int i = 0; i < NumElts; ++i) {
		int Lane = Mask[i];
		// NOTE(review): the first loop over the mask already returned on any undef
		// element, so this check cannot fire here.
		if (Lane == UndefMaskElem)
		return SDValue();
		++DifferentLaneMap[Lane];
		}

		// Pick the lane used by the most mask positions. The comparison is strict,
		// so on a tie the lane visited first keeps the candidacy.
		int DUPCandidateLane = -1;
		int DUPLaneCount = 0;
		for (auto Pair : DifferentLaneMap) {
		if (Pair.second > DUPLaneCount) {
		DUPCandidateLane = Pair.first;
		DUPLaneCount = Pair.second;
		}
		}

		// Let's try to generate DUP and ins. For example,
		//
		// t2: v8f16,ch = CopyFromReg t0, Register:v8f16 %0
		// t4: v8f16,ch = CopyFromReg t0, Register:v8f16 %1
		// t5: v8f16 = vector_shuffle<0,0,0,0,0,8,1,15> t2, t4
		// ==>
		// t2: v8f16,ch = CopyFromReg t0, Register:v8f16 %0
		// t4: v8f16,ch = CopyFromReg t0, Register:v8f16 %1
		// t12: f16 = extract_vector_elt t2, Constant:i64<0>
		// t13: v8f16 = AArch64ISD::DUP t12
		// t14: f16 = extract_vector_elt t4, Constant:i64<0>
		// t16: v8f16 = insert_vector_elt t13, t14, Constant:i64<5>
		// t18: f16 = extract_vector_elt t2, Constant:i64<1>
		// t20: v8f16 = insert_vector_elt t16, t18, Constant:i64<6>
		// t22: f16 = extract_vector_elt t4, Constant:i64<7>
		// t23: v8f16 = insert_vector_elt t20, t22, Constant:i64<7>
		//
		// Only taken when the dominant lane fills strictly more than half of the
		// mask and the scalar element type is legal.
		if (DUPLaneCount > NumElts / 2 &&
		DAG.getTargetLoweringInfo().isTypeLegal(VT.getVectorElementType())) {
		// Create DUP.
		int SrcLane = (DUPCandidateLane >= NumElts) ? DUPCandidateLane - NumElts
		: DUPCandidateLane;
		SDValue SrcVec = (DUPCandidateLane >= NumElts) ? V1 : V0;
		SDValue SrcElt =
		DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(),
		SrcVec, DAG.getConstant(SrcLane, dl, MVT::i64));
		SDValue NewVec = DAG.getNode(AArch64ISD::DUP, dl, VT, SrcElt);
		// Create ins: one extract/insert pair for every position whose mask
		// element differs from the splatted lane.
		for (int i = 0; i < NumElts; ++i) {
		if (Mask[i] != DUPCandidateLane) {
		SrcLane = (Mask[i] >= NumElts) ? Mask[i] - NumElts : Mask[i];
		// Note: SrcVec and SrcElt below shadow the outer variables of the same
		// names that were used to build the DUP.
		SDValue SrcVec = (Mask[i] >= NumElts) ? V1 : V0;
		SDValue SrcElt =
		DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(),
		SrcVec, DAG.getConstant(SrcLane, dl, MVT::i64));
		NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVec, SrcElt,
		DAG.getConstant(i, dl, MVT::i64));
		}
		}
		return NewVec;
		}

		// Neither pattern matched.
		return SDValue();
		}

// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,		// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
// but we don't have an appropriate instruction,		// but we don't have an appropriate instruction,
// so custom-lower it as ZIP1-with-zeros.		// so custom-lower it as ZIP1-with-zeros.
SDValue		SDValue
AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,		AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc dl(Op);		SDLoc dl(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines	if (NumElts == 4) {
// Compute the index in the perfect shuffle table.		// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +		unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];		PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];		unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,		return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
dl);		dl);
}		}

		if (SDValue NewSD = tryHandleMaskWithSplats(Op, DAG))
		return NewSD;

return GenerateTBL(Op, ShuffleMask, DAG);		return GenerateTBL(Op, ShuffleMask, DAG);
}		}

SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,		SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();

if (useSVEForFixedLengthVectorVT(VT,		if (useSVEForFixedLengthVectorVT(VT,
▲ Show 20 Lines • Show All 13,187 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
	; RUN: llc < %s -mtriple=arm64-apple-ios7.0 \| FileCheck %s			; RUN: llc < %s -mtriple=arm64-apple-ios7.0 \| FileCheck %s
	; rdar://13214163 - Make sure we generate a correct lookup table for the TBL			; rdar://13214163 - Make sure we generate a correct lookup table for the TBL
	; instruction when the element size of the vector is not 8 bits. We were			; instruction when the element size of the vector is not 8 bits. We were
	; getting both the endianness wrong and the element indexing wrong.			; getting both the endianness wrong and the element indexing wrong.
	define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {			define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
	; CHECK: .section __TEXT,__literal16,16byte_literals			; CHECK-LABEL: foo:
	; CHECK: .p2align 4			; CHECK: ; %bb.0:
	; CHECK:lCPI0_0:			; CHECK-NEXT: dup.4h v1, v0[4]
	; CHECK: .byte 0 ; 0x0			; CHECK-NEXT: dup.4h v0, v0[0]
	; CHECK: .byte 1 ; 0x1			; CHECK-NEXT: mov.d v0[1], v1[0]
	; CHECK: .byte 0 ; 0x0			; CHECK-NEXT: ret
	; CHECK: .byte 1 ; 0x1
	; CHECK: .byte 0 ; 0x0
	; CHECK: .byte 1 ; 0x1
	; CHECK: .byte 0 ; 0x0
	; CHECK: .byte 1 ; 0x1
	; CHECK: .byte 8 ; 0x8
	; CHECK: .byte 9 ; 0x9
	; CHECK: .byte 8 ; 0x8
	; CHECK: .byte 9 ; 0x9
	; CHECK: .byte 8 ; 0x8
	; CHECK: .byte 9 ; 0x9
	; CHECK: .byte 8 ; 0x8
	; CHECK: .byte 9 ; 0x9
	; CHECK: .section __TEXT,__text,regular,pure_instructions
	; CHECK: .globl _foo
	; CHECK: .p2align 2
	; CHECK:_foo: ; @foo
	; CHECK: adrp [[BASE:x[0-9]+]], lCPI0_0@PAGE
	; CHECK: ldr q[[REG:[0-9]+]], [[[BASE]], lCPI0_0@PAGEOFF]
	; CHECK: tbl.16b v0, { v0 }, v[[REG]]
	; CHECK: ret

	%val = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>			%val = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
	ret <8 x i16> %val			ret <8 x i16> %val
	}			}

llvm/test/CodeGen/AArch64/shuffles.ll

	Show First 20 Lines • Show All 164 Lines • ▼ Show 20 Lines
	{			{
	%r = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>			%r = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>
	ret <8 x i8> %r			ret <8 x i8> %r
	}			}

	define <8 x i16> @test_shuf9(<8 x i16> %a, <8 x i16> %b)			define <8 x i16> @test_shuf9(<8 x i16> %a, <8 x i16> %b)
	; CHECK-LABEL: test_shuf9:			; CHECK-LABEL: test_shuf9:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI13_0			; CHECK-NEXT: dup v1.4h, v1.h[0]
	; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1			; CHECK-NEXT: dup v0.4h, v0.h[0]
	; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1			; CHECK-NEXT: mov v0.d[1], v1.d[0]
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
	; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	{			{
	%r = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>			%r = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>
	ret <8 x i16> %r			ret <8 x i16> %r
	}			}

	define <16 x i8> @test_shuf10(<16 x i8> %a, <16 x i8> %b)			define <16 x i8> @test_shuf10(<16 x i8> %a, <16 x i8> %b)
	; CHECK-LABEL: test_shuf10:			; CHECK-LABEL: test_shuf10:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI14_0			; CHECK-NEXT: dup v1.8b, v0.b[8]
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]			; CHECK-NEXT: dup v0.8b, v0.b[0]
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b			; CHECK-NEXT: mov v0.d[1], v1.d[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	{			{
	%r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >			%r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >
	ret <16 x i8> %r			ret <16 x i8> %r
	}			}

	define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)			define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
	; CHECK-LABEL: test_shuf11:			; CHECK-LABEL: test_shuf11:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI15_0			; CHECK-NEXT: dup v1.4h, v1.h[0]
	; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1			; CHECK-NEXT: dup v0.4h, v0.h[0]
	; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1			; CHECK-NEXT: mov v0.d[1], v1.d[0]
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0]
	; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	{			{
	%r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>			%r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>
	ret <8 x half> %r			ret <8 x half> %r
	}			}

	define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)			define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
	; CHECK-LABEL: test_shuf12:			; CHECK-LABEL: test_shuf12:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI16_0			; CHECK-NEXT: dup v2.8h, v0.h[0]
	; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1			; CHECK-NEXT: mov v2.h[5], v1.h[0]
	; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1			; CHECK-NEXT: mov v2.h[6], v0.h[1]
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]			; CHECK-NEXT: mov v2.h[7], v1.h[7]
	; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b			; CHECK-NEXT: mov v0.16b, v2.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	{			{
	%r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 1, i32 15>			%r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 1, i32 15>
	ret <8 x half> %r			ret <8 x half> %r
	}			}

	define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)			define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
	; CHECK-LABEL: test_shuf13:			; CHECK-LABEL: test_shuf13:
	Show All 39 Lines