This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
1/3
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
named-vector-shuffle-reverse-neon.ll
-
neon-reverseshuffle.patch

Differential D100882

[AArch64] Improve vector reverse lowering
ClosedPublic

Authored by dmgreen on Apr 20 2021, 12:57 PM.

Download Raw Diff

Details

Reviewers

RKSimon
fhahn
ABataev
sdesmalen
david-arm

Commits

rGc0bf5929eea7: [AArch64] Improve vector reverse lowering

Summary

This improves the lowering of v8i16 and v16i8 vector reverse shuffles. Instead of going via a generic tbl it uses a rev64; ext pair, as already happens for v4i32.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

dmgreen created this revision. · Apr 20 2021, 12:57 PM

Herald added subscribers: danielkiss, hiraditya, kristof.beyls. · View Herald Transcript · Apr 20 2021, 12:57 PM

dmgreen requested review of this revision. · Apr 20 2021, 12:57 PM

Herald added a project: Restricted Project. · View Herald Transcript · Apr 20 2021, 12:57 PM

Harbormaster completed remote builds in B99786: Diff 338967. · Apr 20 2021, 1:08 PM

LGTM!

This revision is now accepted and ready to land. · Apr 21 2021, 1:03 AM

sdesmalen added inline comments. · Apr 21 2021, 1:44 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
9060	nit: Is this condition necessary? I know for LLVM IR nodes the result type doesn't necessarily have the same number of elements as the source vectors (but instead equals the number of elements in the mask), but is the same true for VECTOR_SHUFFLE? The reason for asking is that I see in ISDOpcodes that it says: /// VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as /// VEC1/VEC2.

Thanks!

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
9060	Yeah, I added it as an additional safety check. It didn't alter any of the test cases I had, but I figured it was better safe than sorry. I can remove it though, if it is guaranteed that they will already match in size.

sdesmalen added inline comments. · Apr 21 2021, 4:36 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
9060	I think the sizes are supposed to be the same, looking at `SelectionDAGBuilder::visitShuffleVector` there is code that ensures the sizes match. To be sure, maybe you can add an assert instead?

Matt added a subscriber: Matt. · Apr 21 2021, 6:19 AM

Now with extra asserts.

Harbormaster completed remote builds in B100024: Diff 339287. · Apr 21 2021, 11:28 AM

This revision was landed with ongoing or failed builds. · Apr 22 2021, 1:01 PM

Closed by commit rGc0bf5929eea7: [AArch64] Improve vector reverse lowering (authored by dmgreen). · Explain Why

This revision was automatically updated to reflect the committed changes.

dmgreen added a commit: rGc0bf5929eea7: [AArch64] Improve vector reverse lowering.

Herald added a subscriber: tmatheson. · View Herald Transcript · Apr 22 2021, 1:01 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

12 lines

test/

CodeGen/

AArch64/

named-vector-shuffle-reverse-neon.ll

66 lines

neon-reverseshuffle.patch

15 lines

Diff 339763

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,001 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// DAG nodes, instead of keeping them as shuffles and matching them again		// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility		// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.		// of inconsistencies between legalization and selection.
ArrayRef<int> ShuffleMask = SVN->getMask();		ArrayRef<int> ShuffleMask = SVN->getMask();

SDValue V1 = Op.getOperand(0);		SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);		SDValue V2 = Op.getOperand(1);

		assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
		assert(ShuffleMask.size() == VT.getVectorNumElements() &&
		"Unexpected VECTOR_SHUFFLE mask size!");

if (SVN->isSplat()) {		if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();		int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.		// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)		if (Lane == -1)
Lane = 0;		Lane = 0;

if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)		if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),		return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
Show All 30 Lines	SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,

if (isREVMask(ShuffleMask, VT, 64))		if (isREVMask(ShuffleMask, VT, 64))
return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);		return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 32))		if (isREVMask(ShuffleMask, VT, 32))
return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);		return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 16))		if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);		return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

		if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: Is this condition necessary? I know for LLVM IR nodes the result type doesn't necessarily have the same number of elements as the source vectors (but instead equals the number of elements in the mask), but is the same true for VECTOR_SHUFFLE? The reason for asking is that I see in ISDOpcodes that it says: /// VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as /// VEC1/VEC2. sdesmalen: nit: Is this condition necessary? I know for LLVM IR nodes the result type doesn't necessarily…
		dmgreenAuthorUnsubmitted Done Reply Inline Actions Yeah, I added it as an additional safety check. It didn't alter any of the test cases I had, but I figured it was better safe than sorry. I can remove it though, if it is guaranteed that they will already match in size. dmgreen: Yeah, I added it as an additional safety check. It didn't alter any of the test cases I had…
		sdesmalenUnsubmitted Not Done Reply Inline Actions I think the sizes are supposed to be the same, looking at `SelectionDAGBuilder::visitShuffleVector` there is code that ensures the sizes match. To be sure, maybe you can add an assert instead? sdesmalen: I think the sizes are supposed to be the same, looking at `SelectionDAGBuilder…
		(VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
		ShuffleVectorInst::isReverseMask(ShuffleMask)) {
		SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
		return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
		DAG.getConstant(8, dl, MVT::i32));
		}

bool ReverseEXT = false;		bool ReverseEXT = false;
unsigned Imm;		unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {		if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
if (ReverseEXT)		if (ReverseEXT)
std::swap(V1, V2);		std::swap(V1, V2);
Imm *= getExtFactor(V1);		Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,		return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));		DAG.getConstant(Imm, dl, MVT::i32));
▲ Show 20 Lines • Show All 8,547 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SELDAG %s			; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SELDAG %s
	; RUN: llc -verify-machineinstrs -O0 < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-FASTISEL %s			; RUN: llc -verify-machineinstrs -O0 < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-FASTISEL %s

	target triple = "aarch64-unknown-linux-gnu"			target triple = "aarch64-unknown-linux-gnu"

	;			;
	; VECTOR_REVERSE			; VECTOR_REVERSE
	;			;

	define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {			define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
	; CHECK-LABEL: .LCPI0_0:
	; CHECK: .byte 15 // 0xf
	; CHECK-NEXT: .byte 14 // 0xe
	; CHECK-NEXT: .byte 13 // 0xd
	; CHECK-NEXT: .byte 12 // 0xc
	; CHECK-NEXT: .byte 11 // 0xb
	; CHECK-NEXT: .byte 10 // 0xa
	; CHECK-NEXT: .byte 9 // 0x9
	; CHECK-NEXT: .byte 8 // 0x8
	; CHECK-NEXT: .byte 7 // 0x7
	; CHECK-NEXT: .byte 6 // 0x6
	; CHECK-NEXT: .byte 5 // 0x5
	; CHECK-NEXT: .byte 4 // 0x4
	; CHECK-NEXT: .byte 3 // 0x3
	; CHECK-NEXT: .byte 2 // 0x2
	; CHECK-NEXT: .byte 1 // 0x1
	; CHECK-NEXT: .byte 0 // 0x0
	; CHECK-LABEL: reverse_v16i8:			; CHECK-LABEL: reverse_v16i8:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI0_0			; CHECK-NEXT: rev64 v0.16b, v0.16b
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	%res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)			%res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
	ret <16 x i8> %res			ret <16 x i8> %res
	}			}

	define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {			define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
	; CHECK-LABEL: .LCPI1_0:
	; CHECK: .byte 14 // 0xe
	; CHECK-NEXT: .byte 15 // 0xf
	; CHECK-NEXT: .byte 12 // 0xc
	; CHECK-NEXT: .byte 13 // 0xd
	; CHECK-NEXT: .byte 10 // 0xa
	; CHECK-NEXT: .byte 11 // 0xb
	; CHECK-NEXT: .byte 8 // 0x8
	; CHECK-NEXT: .byte 9 // 0x9
	; CHECK-NEXT: .byte 6 // 0x6
	; CHECK-NEXT: .byte 7 // 0x7
	; CHECK-NEXT: .byte 4 // 0x4
	; CHECK-NEXT: .byte 5 // 0x5
	; CHECK-NEXT: .byte 2 // 0x2
	; CHECK-NEXT: .byte 3 // 0x3
	; CHECK-NEXT: .byte 0 // 0x0
	; CHECK-NEXT: .byte 1 // 0x1
	; CHECK-LABEL: reverse_v8i16:			; CHECK-LABEL: reverse_v8i16:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI1_0			; CHECK-NEXT: rev64 v0.8h, v0.8h
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	%res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)			%res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
	ret <8 x i16> %res			ret <8 x i16> %res
	}			}

	define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {			define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
	; CHECK-LABEL: reverse_v4i32:			; CHECK-LABEL: reverse_v4i32:
	Show All 12 Lines
	; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	%res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)			%res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
	ret <2 x i64> %res			ret <2 x i64> %res
	}			}

	define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {			define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {
	; CHECK-LABEL: .LCPI4_0:
	; CHECK: .byte 14 // 0xe
	; CHECK-NEXT: .byte 15 // 0xf
	; CHECK-NEXT: .byte 12 // 0xc
	; CHECK-NEXT: .byte 13 // 0xd
	; CHECK-NEXT: .byte 10 // 0xa
	; CHECK-NEXT: .byte 11 // 0xb
	; CHECK-NEXT: .byte 8 // 0x8
	; CHECK-NEXT: .byte 9 // 0x9
	; CHECK-NEXT: .byte 6 // 0x6
	; CHECK-NEXT: .byte 7 // 0x7
	; CHECK-NEXT: .byte 4 // 0x4
	; CHECK-NEXT: .byte 5 // 0x5
	; CHECK-NEXT: .byte 2 // 0x2
	; CHECK-NEXT: .byte 3 // 0x3
	; CHECK-NEXT: .byte 0 // 0x0
	; CHECK-NEXT: .byte 1 // 0x1
	; CHECK-LABEL: reverse_v8f16:			; CHECK-LABEL: reverse_v8f16:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI4_0			; CHECK-NEXT: rev64 v0.8h, v0.8h
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	%res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)			%res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
	ret <8 x half> %res			ret <8 x half> %res
	}			}

	define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {			define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
	; CHECK-LABEL: reverse_v4f32:			; CHECK-LABEL: reverse_v4f32:
	▲ Show 20 Lines • Show All 108 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/neon-reverseshuffle.patch

	Show All 29 Lines
	entry:			entry:
	%V128 = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0>			%V128 = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
	ret <2 x i32> %V128			ret <2 x i32> %V128
	}			}

	define <8 x i16> @v8i16(<8 x i16> %a) {			define <8 x i16> @v8i16(<8 x i16> %a) {
	; CHECK-LABEL: v8i16:			; CHECK-LABEL: v8i16:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: adrp x8, .LCPI3_0			; CHECK-NEXT: rev64 v0.8h, v0.8h
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%V128 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>			%V128 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <8 x i16> %V128			ret <8 x i16> %V128
	}			}

	define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {			define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {
	; CHECK-LABEL: v8i16_2:			; CHECK-LABEL: v8i16_2:
	Show All 15 Lines
	entry:			entry:
	%V128 = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>			%V128 = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
	ret <4 x i16> %V128			ret <4 x i16> %V128
	}			}

	define <16 x i8> @v16i8(<16 x i8> %a) {			define <16 x i8> @v16i8(<16 x i8> %a) {
	; CHECK-LABEL: v16i8:			; CHECK-LABEL: v16i8:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: adrp x8, .LCPI6_0			; CHECK-NEXT: rev64 v0.16b, v0.16b
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%V128 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>			%V128 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <16 x i8> %V128			ret <16 x i8> %V128
	}			}

	define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {			define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {
	; CHECK-LABEL: v16i8_2:			; CHECK-LABEL: v16i8_2:
	▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines
	entry:			entry:
	%V128 = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0>			%V128 = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0>
	ret <2 x float> %V128			ret <2 x float> %V128
	}			}

	define <8 x half> @v8f16(<8 x half> %a) {			define <8 x half> @v8f16(<8 x half> %a) {
	; CHECK-LABEL: v8f16:			; CHECK-LABEL: v8f16:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: adrp x8, .LCPI12_0			; CHECK-NEXT: rev64 v0.8h, v0.8h
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]			; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
	; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%V128 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>			%V128 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <8 x half> %V128			ret <8 x half> %V128
	}			}

	define <4 x half> @v4f16(<4 x half> %a) {			define <4 x half> @v4f16(<4 x half> %a) {
	; CHECK-LABEL: v4f16:			; CHECK-LABEL: v4f16:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: rev64 v0.4h, v0.4h			; CHECK-NEXT: rev64 v0.4h, v0.4h
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%V128 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>			%V128 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
	ret <4 x half> %V128			ret <4 x half> %V128
	}			}