This is an archive of the discontinued LLVM Phabricator instance.

[ARM] MVE reverse shuffles.
ClosedPublic

Authored by dmgreen on Oct 28 2019, 6:06 AM.

Download Raw Diff

Details

Reviewers

t.p.northover
samparker
SjoerdMeijer
ostannard
simon_tatham

Commits

rG3f90df22f1b7: [ARM] MVE reverse shuffles.

Summary

The vectorizer can sometimes make reverse shuffles from indices that count down. In MVE, we don't have a 128-bit rev instruction, but we can select this to a VREV64 with some lane movs to swap the two halves.

Ideally this would use VMOVDs, but it only gets as far as VMOVSs at the moment.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

dmgreen created this revision.Oct 28 2019, 6:06 AM

Herald added a project: Restricted Project. · View Herald TranscriptOct 28 2019, 6:06 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

Updated the cost model too.

samparker added inline comments.Oct 30 2019, 5:49 AM

llvm/lib/Target/ARM/ARMISelLowering.cpp
8330–8333	I think keeping some kind of assert is a good idea here.
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
642 ↗	(On Diff #226693)	should we be considering v4i16 and v8i8 too?

dmgreen marked 2 inline comments as done.Oct 30 2019, 11:49 AM

dmgreen added inline comments.

llvm/lib/Target/ARM/ARMISelLowering.cpp
8330–8333	Yeah, OK. Probably best. I had this going with v4i32 too, which is where this was lost. But until double moves are selected better, that doesn't improve things.
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
642 ↗	(On Diff #226693)	Those 2 will be type legalised to a v4i32 and v8i16, and the final cost will use the legal type (multiplied by the type legalisation cost, which looks like it will be 1 from the tests). Because we don't use vmovd's yet, I guess the cost of a v4i32 should be 4, not 5. I'll update that, but they will hopefully drop down to 3 in the future, like the rest.

dmgreen updated this revision to Diff 227153.Oct 30 2019, 12:21 PM

Ok, LGTM

This revision is now accepted and ready to land.Nov 13 2019, 3:01 AM

Closed by commit rG3f90df22f1b7: [ARM] MVE reverse shuffles. (authored by dmgreen). · Explain WhySep 20 2021, 5:48 AM

This revision was automatically updated to reflect the committed changes.

dmgreen added a commit: rG3f90df22f1b7: [ARM] MVE reverse shuffles..

Revision Contents

Path

Size

llvm/

lib/

Target/

ARM/

ARMISelLowering.cpp

31 lines

test/

CodeGen/

Thumb2/

mve-shuffle.ll

66 lines

mve-shufflemov.ll

66 lines

Diff 373562

llvm/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,225 Lines • ▼ Show 20 Lines	if (EltSize >= 32 \|\|
isVREVMask(M, VT, 32) \|\|		isVREVMask(M, VT, 32) \|\|
isVREVMask(M, VT, 16))		isVREVMask(M, VT, 16))
return true;		return true;
else if (Subtarget->hasNEON() &&		else if (Subtarget->hasNEON() &&
(isVEXTMask(M, VT, ReverseVEXT, Imm) \|\|		(isVEXTMask(M, VT, ReverseVEXT, Imm) \|\|
isVTBLMask(M, VT) \|\|		isVTBLMask(M, VT) \|\|
isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))		isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
return true;		return true;
else if (Subtarget->hasNEON() && (VT == MVT::v8i16 \|\| VT == MVT::v16i8) &&		else if ((VT == MVT::v8i16 \|\| VT == MVT::v8f16 \|\| VT == MVT::v16i8) &&
isReverseMask(M, VT))		isReverseMask(M, VT))
return true;		return true;
else if (Subtarget->hasMVEIntegerOps() &&		else if (Subtarget->hasMVEIntegerOps() &&
(isVMOVNMask(M, VT, true, false) \|\|		(isVMOVNMask(M, VT, true, false) \|\|
isVMOVNMask(M, VT, false, false) \|\| isVMOVNMask(M, VT, true, true)))		isVMOVNMask(M, VT, false, false) \|\| isVMOVNMask(M, VT, true, true)))
return true;		return true;
else		else
return false;		return false;
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
if (V2.getNode()->isUndef())		if (V2.getNode()->isUndef())
return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,		return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));		DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));

return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,		return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));		DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}		}

static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,		static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
SDValue OpLHS = Op.getOperand(0);		EVT VT = Op.getValueType();
EVT VT = OpLHS.getValueType();

assert((VT == MVT::v8i16 \|\| VT == MVT::v16i8) &&		assert((VT == MVT::v8i16 \|\| VT == MVT::v8f16 \|\| VT == MVT::v16i8) &&
"Expect an v8i16/v16i8 type");		"Expect an v8i16/v16i8 type");
OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);		SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
		samparkerUnsubmitted Not Done Reply Inline Actions I think keeping some kind of assert is a good idea here. samparker: I think keeping some kind of assert is a good idea here.
		dmgreenAuthorUnsubmitted Done Reply Inline Actions Yeah, OK. Probably best. I had this going with v4i32 too, which is where this was lost. But until double moves are selected better, that doesn't improve things. dmgreen: Yeah, OK. Probably best. I had this going with v4i32 too, which is where this was lost. But…
// For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,		// For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
// extract the first 8 bytes into the top double word and the last 8 bytes		// extract the first 8 bytes into the top double word and the last 8 bytes
// into the bottom double word. The v8i16 case is similar.		// into the bottom double word, through a new vector shuffle that will be
unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;		// turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,		std::vector<int> NewMask;
DAG.getConstant(ExtractNum, DL, MVT::i32));		for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
		NewMask.push_back(VT.getVectorNumElements() / 2 + i);
		for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
		NewMask.push_back(i);
		return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
}		}

static EVT getVectorTyFromPredicateVector(EVT VT) {		static EVT getVectorTyFromPredicateVector(EVT VT) {
switch (VT.getSimpleVT().SimpleTy) {		switch (VT.getSimpleVT().SimpleTy) {
case MVT::v4i1:		case MVT::v4i1:
return MVT::v4i32;		return MVT::v4i32;
case MVT::v8i1:		case MVT::v8i1:
return MVT::v8i16;		return MVT::v8i16;
▲ Show 20 Lines • Show All 405 Lines • ▼ Show 20 Lines	for (unsigned i = 0; i < NumElts; ++i) {
ShuffleMask[i] < (int)NumElts ? V1 : V2,		ShuffleMask[i] < (int)NumElts ? V1 : V2,
DAG.getConstant(ShuffleMask[i] & (NumElts-1),		DAG.getConstant(ShuffleMask[i] & (NumElts-1),
dl, MVT::i32)));		dl, MVT::i32)));
}		}
SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);		SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, Val);		return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}		}

if (ST->hasNEON() && (VT == MVT::v8i16 \|\| VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))		if ((VT == MVT::v8i16 \|\| VT == MVT::v8f16 \|\| VT == MVT::v16i8) &&
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);		isReverseMask(ShuffleMask, VT))
		return LowerReverse_VECTOR_SHUFFLE(Op, DAG);

if (ST->hasNEON() && VT == MVT::v8i8)		if (ST->hasNEON() && VT == MVT::v8i8)
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))		if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;		return NewOp;

if (ST->hasMVEIntegerOps())		if (ST->hasMVEIntegerOps())
if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))		if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
return NewOp;		return NewOp;
▲ Show 20 Lines • Show All 12,336 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-shuffle.ll

Show First 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	entry:
ret <4 x i32> %r		ret <4 x i32> %r
}		}

; i16		; i16

define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {		define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle1_i16:		; CHECK-LABEL: shuffle1_i16:
; CHECK: @ %bb.0: @ %entry		; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q1, q0		; CHECK-NEXT: vrev64.16 q1, q0
; CHECK-NEXT: vmovx.f16 s0, s7		; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmovx.f16 s1, s6		; CHECK-NEXT: vmov.f32 s1, s7
; CHECK-NEXT: vmovx.f16 s2, s5		; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmovx.f16 s3, s4		; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vins.f16 s0, s7
; CHECK-NEXT: vins.f16 s1, s6
; CHECK-NEXT: vins.f16 s2, s5
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: bx lr		; CHECK-NEXT: bx lr
entry:		entry:
%out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>		%out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i16> %out		ret <8 x i16> %out
}		}

define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) {		define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle2_i16:		; CHECK-LABEL: shuffle2_i16:
▲ Show 20 Lines • Show All 253 Lines • ▼ Show 20 Lines	entry:
ret <8 x i16> %r		ret <8 x i16> %r
}		}

; i8		; i8

define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {		define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle1_i8:		; CHECK-LABEL: shuffle1_i8:
; CHECK: @ %bb.0: @ %entry		; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q1, q0		; CHECK-NEXT: vrev64.8 q1, q0
; CHECK-NEXT: vmov.u8 r0, q0[15]		; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmov.8 q0[0], r0		; CHECK-NEXT: vmov.f32 s1, s7
; CHECK-NEXT: vmov.u8 r0, q1[14]		; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.8 q0[1], r0		; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov.u8 r0, q1[13]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[12]
; CHECK-NEXT: vmov.8 q0[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[11]
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmov.u8 r0, q1[10]
; CHECK-NEXT: vmov.8 q0[5], r0
; CHECK-NEXT: vmov.u8 r0, q1[9]
; CHECK-NEXT: vmov.8 q0[6], r0
; CHECK-NEXT: vmov.u8 r0, q1[8]
; CHECK-NEXT: vmov.8 q0[7], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.8 q0[8], r0
; CHECK-NEXT: vmov.u8 r0, q1[6]
; CHECK-NEXT: vmov.8 q0[9], r0
; CHECK-NEXT: vmov.u8 r0, q1[5]
; CHECK-NEXT: vmov.8 q0[10], r0
; CHECK-NEXT: vmov.u8 r0, q1[4]
; CHECK-NEXT: vmov.8 q0[11], r0
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.8 q0[12], r0
; CHECK-NEXT: vmov.u8 r0, q1[2]
; CHECK-NEXT: vmov.8 q0[13], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.8 q0[14], r0
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmov.8 q0[15], r0
; CHECK-NEXT: bx lr		; CHECK-NEXT: bx lr
entry:		entry:
%out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>		%out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <16 x i8> %out		ret <16 x i8> %out
}		}

define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) {		define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) {
; CHECK-LABEL: shuffle2_i8:		; CHECK-LABEL: shuffle2_i8:
▲ Show 20 Lines • Show All 699 Lines • ▼ Show 20 Lines	entry:
ret <4 x float> %r		ret <4 x float> %r
}		}

; f16		; f16

define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {		define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
; CHECK-LABEL: shuffle1_f16:		; CHECK-LABEL: shuffle1_f16:
; CHECK: @ %bb.0: @ %entry		; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q1, q0		; CHECK-NEXT: vrev64.16 q1, q0
; CHECK-NEXT: vmovx.f16 s0, s7		; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: vmovx.f16 s1, s6		; CHECK-NEXT: vmov.f32 s1, s7
; CHECK-NEXT: vmovx.f16 s2, s5		; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmovx.f16 s3, s4		; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vins.f16 s0, s7
; CHECK-NEXT: vins.f16 s1, s6
; CHECK-NEXT: vins.f16 s2, s5
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: bx lr		; CHECK-NEXT: bx lr
entry:		entry:
%out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>		%out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x half> %out		ret <8 x half> %out
}		}

define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) {		define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) {
; CHECK-LABEL: shuffle2_f16:		; CHECK-LABEL: shuffle2_f16:
▲ Show 20 Lines • Show All 501 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-shufflemov.ll

	Show All 29 Lines
	entry:			entry:
	%out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>			%out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
	ret <8 x i16> %out			ret <8 x i16> %out
	}			}

	define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) {			define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) {
	; CHECK-LABEL: shuffle_i16_76543210:			; CHECK-LABEL: shuffle_i16_76543210:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmov q1, q0			; CHECK-NEXT: vrev64.16 q1, q0
	; CHECK-NEXT: vmovx.f16 s0, s7			; CHECK-NEXT: vmov.f32 s0, s6
	; CHECK-NEXT: vmovx.f16 s1, s6			; CHECK-NEXT: vmov.f32 s1, s7
	; CHECK-NEXT: vmovx.f16 s2, s5			; CHECK-NEXT: vmov.f32 s2, s4
	; CHECK-NEXT: vmovx.f16 s3, s4			; CHECK-NEXT: vmov.f32 s3, s5
	; CHECK-NEXT: vins.f16 s0, s7
	; CHECK-NEXT: vins.f16 s1, s6
	; CHECK-NEXT: vins.f16 s2, s5
	; CHECK-NEXT: vins.f16 s3, s4
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>			%out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <8 x i16> %out			ret <8 x i16> %out
	}			}

	define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_01234567(<8 x i16> %s1, <8 x i16> %s2) {			define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_01234567(<8 x i16> %s1, <8 x i16> %s2) {
	; CHECK-LABEL: shuffle_i16_01234567:			; CHECK-LABEL: shuffle_i16_01234567:
	▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines
	entry:			entry:
	%out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> <i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9, i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>			%out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> <i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9, i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
	ret <16 x i8> %out			ret <16 x i8> %out
	}			}

	define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_fedcba9876543210(<16 x i8> %s1, <16 x i8> %s2) {			define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_fedcba9876543210(<16 x i8> %s1, <16 x i8> %s2) {
	; CHECK-LABEL: shuffle_i8_fedcba9876543210:			; CHECK-LABEL: shuffle_i8_fedcba9876543210:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmov q1, q0			; CHECK-NEXT: vrev64.8 q1, q0
	; CHECK-NEXT: vmov.u8 r0, q0[15]			; CHECK-NEXT: vmov.f32 s0, s6
	; CHECK-NEXT: vmov.8 q0[0], r0			; CHECK-NEXT: vmov.f32 s1, s7
	; CHECK-NEXT: vmov.u8 r0, q1[14]			; CHECK-NEXT: vmov.f32 s2, s4
	; CHECK-NEXT: vmov.8 q0[1], r0			; CHECK-NEXT: vmov.f32 s3, s5
	; CHECK-NEXT: vmov.u8 r0, q1[13]
	; CHECK-NEXT: vmov.8 q0[2], r0
	; CHECK-NEXT: vmov.u8 r0, q1[12]
	; CHECK-NEXT: vmov.8 q0[3], r0
	; CHECK-NEXT: vmov.u8 r0, q1[11]
	; CHECK-NEXT: vmov.8 q0[4], r0
	; CHECK-NEXT: vmov.u8 r0, q1[10]
	; CHECK-NEXT: vmov.8 q0[5], r0
	; CHECK-NEXT: vmov.u8 r0, q1[9]
	; CHECK-NEXT: vmov.8 q0[6], r0
	; CHECK-NEXT: vmov.u8 r0, q1[8]
	; CHECK-NEXT: vmov.8 q0[7], r0
	; CHECK-NEXT: vmov.u8 r0, q1[7]
	; CHECK-NEXT: vmov.8 q0[8], r0
	; CHECK-NEXT: vmov.u8 r0, q1[6]
	; CHECK-NEXT: vmov.8 q0[9], r0
	; CHECK-NEXT: vmov.u8 r0, q1[5]
	; CHECK-NEXT: vmov.8 q0[10], r0
	; CHECK-NEXT: vmov.u8 r0, q1[4]
	; CHECK-NEXT: vmov.8 q0[11], r0
	; CHECK-NEXT: vmov.u8 r0, q1[3]
	; CHECK-NEXT: vmov.8 q0[12], r0
	; CHECK-NEXT: vmov.u8 r0, q1[2]
	; CHECK-NEXT: vmov.8 q0[13], r0
	; CHECK-NEXT: vmov.u8 r0, q1[1]
	; CHECK-NEXT: vmov.8 q0[14], r0
	; CHECK-NEXT: vmov.u8 r0, q1[0]
	; CHECK-NEXT: vmov.8 q0[15], r0
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>			%out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <16 x i8> %out			ret <16 x i8> %out
	}			}

	define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_0123456789abcdef(<16 x i8> %s1, <16 x i8> %s2) {			define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_0123456789abcdef(<16 x i8> %s1, <16 x i8> %s2) {
	; CHECK-LABEL: shuffle_i8_0123456789abcdef:			; CHECK-LABEL: shuffle_i8_0123456789abcdef:
	▲ Show 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
	entry:			entry:
	%out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>			%out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
	ret <8 x half> %out			ret <8 x half> %out
	}			}

	define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x half> %s2) {			define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x half> %s2) {
	; CHECK-LABEL: shuffle_f16_76543210:			; CHECK-LABEL: shuffle_f16_76543210:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmov q1, q0			; CHECK-NEXT: vrev64.16 q1, q0
	; CHECK-NEXT: vmovx.f16 s0, s7			; CHECK-NEXT: vmov.f32 s0, s6
	; CHECK-NEXT: vmovx.f16 s1, s6			; CHECK-NEXT: vmov.f32 s1, s7
	; CHECK-NEXT: vmovx.f16 s2, s5			; CHECK-NEXT: vmov.f32 s2, s4
	; CHECK-NEXT: vmovx.f16 s3, s4			; CHECK-NEXT: vmov.f32 s3, s5
	; CHECK-NEXT: vins.f16 s0, s7
	; CHECK-NEXT: vins.f16 s1, s6
	; CHECK-NEXT: vins.f16 s2, s5
	; CHECK-NEXT: vins.f16 s3, s4
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>			%out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <8 x half> %out			ret <8 x half> %out
	}			}

	define arm_aapcs_vfpcc <8 x half> @shuffle_f16_01234567(<8 x half> %s1, <8 x half> %s2) {			define arm_aapcs_vfpcc <8 x half> @shuffle_f16_01234567(<8 x half> %s1, <8 x half> %s2) {
	; CHECK-LABEL: shuffle_f16_01234567:			; CHECK-LABEL: shuffle_f16_01234567:
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines