This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Fix ReconstructShuffle for bigendian
ClosedPublic

Authored by dmgreen on Feb 12 2020, 6:27 AM.

Download Raw Diff

Details

Reviewers

simon_tatham
samparker
SjoerdMeijer
ostannard

Commits

rG9d4c59754110: [ARM] Fix ReconstructShuffle for bigendian

Summary

Simon pointed out that this function is doing a bitcast, which can be incorrect for big endian. This makes the lowering of VMOVN in MVE incorrect, but the function is shared between Neon and MVE so both can be incorrect.

This attempts to fix things by using the newly added VECTOR_REG_CAST instead of the BITCAST. As it may now be used on Neon, I've added the relevant patterns for it there too. I've also added a quick dag combine for it, to remove them where possible.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

dmgreen created this revision. · Feb 12 2020, 6:27 AM

Herald added a project: Restricted Project. · View Herald Transcript · Feb 12 2020, 6:27 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

LGTM, with a not-very-important comment nitpick.

llvm/lib/Target/ARM/ARMInstrInfo.td
307	This reference to MVE-specific instruction names might be out of place now this comment is shared with NEON :-) But I don't know enough NEON to be sure of what the analogous load/store instructions there look like. Are those VST1 / VLD1, perhaps?

This revision is now accepted and ready to land. · Feb 12 2020, 6:41 AM

Closed by commit rG9d4c59754110: [ARM] Fix ReconstructShuffle for bigendian (authored by dmgreen). · Explain Why · Feb 13 2020, 2:03 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

ARM/

28 lines

15 lines

17 lines

10 lines

test/

CodeGen/

ARM/

neon-vmovn.ll

4 lines

Thumb2/

mve-vmovn.ll

16 lines

Diff 244364

llvm/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,520 Lines • ▼ Show 20 Lines	SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// Another possible incompatibility occurs from the vector element types. We		// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend		// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.		// for the shuffle.
for (auto &Src : Sources) {		for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();		EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)		if (SrcEltTy == SmallestEltTy)
continue;		continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);		assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);		Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();		Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;		Src.WindowBase *= Src.WindowScale;
}		}

// Final sanity check before we try to actually produce a shuffle.		// Final sanity check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src		LLVM_DEBUG(for (auto Src
: Sources)		: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););		assert(Src.ShuffleVec.getValueType() == ShuffleVT););
Show All 35 Lines	SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };		SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)		for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;		ShuffleOps[i] = Sources[i].ShuffleVec;

SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],		SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask, DAG);		ShuffleOps[1], Mask, DAG);
if (!Shuffle)		if (!Shuffle)
return SDValue();		return SDValue();
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);		return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}		}

enum ShuffleOpCodes {		enum ShuffleOpCodes {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>		OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VREV,		OP_VREV,
OP_VDUP0,		OP_VDUP0,
OP_VDUP1,		OP_VDUP1,
OP_VDUP2,		OP_VDUP2,
▲ Show 20 Lines • Show All 5,357 Lines • ▼ Show 20 Lines	if (Op->getOperand(0).getValueType() == VT)
return Op->getOperand(0);		return Op->getOperand(0);
return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,		return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
Op->getOperand(0).getValueType(), Op->getOperand(0));		Op->getOperand(0).getValueType(), Op->getOperand(0));
}		}

return SDValue();		return SDValue();
}		}

		// DAG combine for ARMISD::VECTOR_REG_CAST nodes.
		//
		// N is the VECTOR_REG_CAST node being combined, DCI provides access to the
		// SelectionDAG used to build replacement nodes, and ST is queried for the
		// target endianness. Returns the replacement value when the cast can be
		// simplified, or an empty SDValue when no simplification applies.
		static SDValue
		PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
		const ARMSubtarget *ST) {
		EVT VT = N->getValueType(0);
		SDValue Op = N->getOperand(0);
		SDLoc dl(N);

		// Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
		// so replace it with the generic node. This check comes first, so the
		// cast-of-cast folding below is only reached on big-endian subtargets.
		if (ST->isLittle())
		return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);

		// VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
		if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
		// If the valuetypes are the same, we can remove the cast entirely.
		if (Op->getOperand(0).getValueType() == VT)
		return Op->getOperand(0);
		// Otherwise skip the intermediate type and cast the inner operand
		// directly to VT.
		return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
		}

		// No simplification found.
		return SDValue();
		}

static SDValue PerformVCMPCombine(SDNode *N,		static SDValue PerformVCMPCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {		const ARMSubtarget *Subtarget) {
if (!Subtarget->hasMVEIntegerOps())		if (!Subtarget->hasMVEIntegerOps())
return SDValue();		return SDValue();

EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);		SDValue Op0 = N->getOperand(0);
▲ Show 20 Lines • Show All 1,819 Lines • ▼ Show 20 Lines	SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::VLD2DUP:		case ARMISD::VLD2DUP:
case ARMISD::VLD3DUP:		case ARMISD::VLD3DUP:
case ARMISD::VLD4DUP:		case ARMISD::VLD4DUP:
return PerformVLDCombine(N, DCI);		return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:		case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);		return PerformARMBUILD_VECTORCombine(N, DCI);
case ARMISD::PREDICATE_CAST:		case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);		return PerformPREDICATE_CASTCombine(N, DCI);
		case ARMISD::VECTOR_REG_CAST:
		return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
case ARMISD::VCMP:		case ARMISD::VCMP:
return PerformVCMPCombine(N, DCI, Subtarget);		return PerformVCMPCombine(N, DCI, Subtarget);
case ARMISD::SMULWB: {		case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();		unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);		APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))		if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
return SDValue();		return SDValue();
break;		break;
▲ Show 20 Lines • Show All 2,828 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrInfo.td

	Show First 20 Lines • Show All 288 Lines • ▼ Show 20 Lines

	def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>;			def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>;
	def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>;			def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>;

	def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;			def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
	def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;			def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
	def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;			def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;

				// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
				// vector register as a different vector type, without changing the contents of
				// the register. It differs from 'bitconvert' in that bitconvert reinterprets
				// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
				// reinterprets the _register_ format - and in big-endian, the memory and
				// register formats are different, so they are different operations.
				//
				// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
				// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
				// whereas 'bitconvert' will map it to the high byte in big-endian mode,
				// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
				[Inline comment by simon_tatham (Unsubmitted, Not Done): This reference to MVE-specific instruction names might be out of place now this comment is shared with NEON :-) But I don't know enough NEON to be sure of what the analogous load/store instructions there look like. Are those VST1 / VLD1, perhaps?]
				// bitconvert would have to emit a VREV16.8 instruction, whereas the
				// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
				def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// ARM Flag Definitions.			// ARM Flag Definitions.

	class RegConstraint<string C> {			class RegConstraint<string C> {
	string Constraints = C;			string Constraints = C;
	}			}

	// ARMCC condition codes. See ARMCC::CondCodes			// ARMCC condition codes. See ARMCC::CondCodes
	▲ Show 20 Lines • Show All 5,950 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrMVE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 3,984 Lines • ▼ Show 20 Lines
	}			}

	// Occasionally we need to cast between a i32 and a boolean vector, for			// Occasionally we need to cast between a i32 and a boolean vector, for
	// example when moving between rGPR and VPR.P0 as part of predicate vector			// example when moving between rGPR and VPR.P0 as part of predicate vector
	// shuffles. We also sometimes need to cast between different predicate			// shuffles. We also sometimes need to cast between different predicate
	// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.			// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
	def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;			def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;

	// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
	// vector register as a different vector type, without changing the contents of
	// the register. It differs from 'bitconvert' in that bitconvert reinterprets
	// the _memory_ storage format of the vector, whereas vector_reg_cast
	// reinterprets the _register_ format - and in big-endian, the memory and
	// register formats are different, so they are different operations.
	//
	// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
	// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
	// whereas 'bitconvert' will map it to the high byte in big-endian mode,
	// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
	// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
	// no code at all if the vector is already in a register.
	def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;

	let Predicates = [HasMVEInt] in {			let Predicates = [HasMVEInt] in {
	foreach VT = [ v4i1, v8i1, v16i1 ] in {			foreach VT = [ v4i1, v8i1, v16i1 ] in {
	def : Pat<(i32 (predicate_cast (VT VCCR:$src))),			def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
	(i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>;			(i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>;
	def : Pat<(VT (predicate_cast (i32 VCCR:$src))),			def : Pat<(VT (predicate_cast (i32 VCCR:$src))),
	(VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>;			(VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>;

	foreach VT2 = [ v4i1, v8i1, v16i1 ] in			foreach VT2 = [ v4i1, v8i1, v16i1 ] in
	def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),			def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
	(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;			(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
	}			}

	foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in			foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
	foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in			foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
	def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;			def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
	}			}

	// end of MVE compares			// end of MVE compares

	// start of MVE_qDest_qSrc			// start of MVE_qDest_qSrc

	class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,			class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
	string ops, vpred_ops vpred, string cstr,			string ops, vpred_ops vpred, string cstr,
	▲ Show 20 Lines • Show All 2,472 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrNEON.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,525 Lines • ▼ Show 20 Lines	let Predicates = [IsBE,HasNEON] in {
def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;		def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;		def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;		def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;		def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;		def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;		def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
}		}

		// Selection patterns for ARMVectorRegCast (ARMISD::VECTOR_REG_CAST) when
		// NEON is available. Each pattern selects the cast to the source register
		// itself, so no instruction is emitted for it.
		let Predicates = [HasNEON] in {
		// Casts between every pair of the 128-bit vector types held in QPR.
		foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
		foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
		def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;

		// Casts between every pair of the 64-bit types held in DPR; note the list
		// includes the scalar f64 alongside the vector types.
		foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
		foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
		def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
		}

// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian		// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
let Predicates = [IsBE,HasNEON] in {		let Predicates = [IsBE,HasNEON] in {
def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),		def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
(VREV64q8 (VLD1q8 addrmode6:$addr))>;		(VREV64q8 (VLD1q8 addrmode6:$addr))>;
def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),		def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
(VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>;		(VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>;
def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),		def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
(VREV64q16 (VLD1q16 addrmode6:$addr))>;		(VREV64q16 (VLD1q16 addrmode6:$addr))>;
▲ Show 20 Lines • Show All 1,371 Lines • Show Last 20 Lines

llvm/test/CodeGen/ARM/neon-vmovn.ll

	Show First 20 Lines • Show All 726 Lines • ▼ Show 20 Lines
	define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) {			define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) {
	; CHECK-LABEL: test:			; CHECK-LABEL: test:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vtrn.8 q0, q1			; CHECK-NEXT: vtrn.8 q0, q1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	;			;
	; CHECKBE-LABEL: test:			; CHECKBE-LABEL: test:
	; CHECKBE: @ %bb.0: @ %entry			; CHECKBE: @ %bb.0: @ %entry
	; CHECKBE-NEXT: vrev64.8 q8, q1			; CHECKBE-NEXT: vrev64.16 q8, q1
	; CHECKBE-NEXT: vrev64.8 q9, q0			; CHECKBE-NEXT: vrev64.16 q9, q0
	; CHECKBE-NEXT: vtrn.8 q9, q8			; CHECKBE-NEXT: vtrn.8 q9, q8
	; CHECKBE-NEXT: vrev64.8 q0, q9			; CHECKBE-NEXT: vrev64.8 q0, q9
	; CHECKBE-NEXT: bx lr			; CHECKBE-NEXT: bx lr
	entry:			entry:
	%a0 = extractelement <8 x i16> %src1, i32 0			%a0 = extractelement <8 x i16> %src1, i32 0
	%a1 = extractelement <8 x i16> %src1, i32 1			%a1 = extractelement <8 x i16> %src1, i32 1
	%a2 = extractelement <8 x i16> %src1, i32 2			%a2 = extractelement <8 x i16> %src1, i32 2
	%a3 = extractelement <8 x i16> %src1, i32 3			%a3 = extractelement <8 x i16> %src1, i32 3
	▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-vmovn.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - \| FileCheck %s			; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - \| FileCheck %s
	; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - \| FileCheck %s --check-prefix=CHECKBE			; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - \| FileCheck %s --check-prefix=CHECKBE

	define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2) {			define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2) {
	; CHECK-LABEL: vmovn32_trunc1:			; CHECK-LABEL: vmovn32_trunc1:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmovnt.i32 q0, q1			; CHECK-NEXT: vmovnt.i32 q0, q1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	;			;
	; CHECKBE-LABEL: vmovn32_trunc1:			; CHECKBE-LABEL: vmovn32_trunc1:
	; CHECKBE: @ %bb.0: @ %entry			; CHECKBE: @ %bb.0: @ %entry
	; CHECKBE-NEXT: vrev64.16 q2, q1			; CHECKBE-NEXT: vrev64.32 q2, q1
	; CHECKBE-NEXT: vrev64.16 q1, q0			; CHECKBE-NEXT: vrev64.32 q1, q0
	; CHECKBE-NEXT: vmovnt.i32 q1, q2			; CHECKBE-NEXT: vmovnt.i32 q1, q2
	; CHECKBE-NEXT: vrev64.16 q0, q1			; CHECKBE-NEXT: vrev64.16 q0, q1
	; CHECKBE-NEXT: bx lr			; CHECKBE-NEXT: bx lr
	entry:			entry:
	%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>			%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
	%out = trunc <8 x i32> %strided.vec to <8 x i16>			%out = trunc <8 x i32> %strided.vec to <8 x i16>
	ret <8 x i16> %out			ret <8 x i16> %out
	}			}

	define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2) {			define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2) {
	; CHECK-LABEL: vmovn32_trunc2:			; CHECK-LABEL: vmovn32_trunc2:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmovnt.i32 q1, q0			; CHECK-NEXT: vmovnt.i32 q1, q0
	; CHECK-NEXT: vmov q0, q1			; CHECK-NEXT: vmov q0, q1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	;			;
	; CHECKBE-LABEL: vmovn32_trunc2:			; CHECKBE-LABEL: vmovn32_trunc2:
	; CHECKBE: @ %bb.0: @ %entry			; CHECKBE: @ %bb.0: @ %entry
	; CHECKBE-NEXT: vrev64.16 q2, q0			; CHECKBE-NEXT: vrev64.32 q2, q0
	; CHECKBE-NEXT: vrev64.16 q3, q1			; CHECKBE-NEXT: vrev64.32 q3, q1
	; CHECKBE-NEXT: vmovnt.i32 q3, q2			; CHECKBE-NEXT: vmovnt.i32 q3, q2
	; CHECKBE-NEXT: vrev64.16 q0, q3			; CHECKBE-NEXT: vrev64.16 q0, q3
	; CHECKBE-NEXT: bx lr			; CHECKBE-NEXT: bx lr
	entry:			entry:
	%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>			%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
	%out = trunc <8 x i32> %strided.vec to <8 x i16>			%out = trunc <8 x i32> %strided.vec to <8 x i16>
	ret <8 x i16> %out			ret <8 x i16> %out
	}			}

	define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2) {			define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2) {
	; CHECK-LABEL: vmovn16_trunc1:			; CHECK-LABEL: vmovn16_trunc1:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmovnt.i16 q0, q1			; CHECK-NEXT: vmovnt.i16 q0, q1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	;			;
	; CHECKBE-LABEL: vmovn16_trunc1:			; CHECKBE-LABEL: vmovn16_trunc1:
	; CHECKBE: @ %bb.0: @ %entry			; CHECKBE: @ %bb.0: @ %entry
	; CHECKBE-NEXT: vrev64.8 q2, q1			; CHECKBE-NEXT: vrev64.16 q2, q1
	; CHECKBE-NEXT: vrev64.8 q1, q0			; CHECKBE-NEXT: vrev64.16 q1, q0
	; CHECKBE-NEXT: vmovnt.i16 q1, q2			; CHECKBE-NEXT: vmovnt.i16 q1, q2
	; CHECKBE-NEXT: vrev64.8 q0, q1			; CHECKBE-NEXT: vrev64.8 q0, q1
	; CHECKBE-NEXT: bx lr			; CHECKBE-NEXT: bx lr
	entry:			entry:
	%strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>			%strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
	%out = trunc <16 x i16> %strided.vec to <16 x i8>			%out = trunc <16 x i16> %strided.vec to <16 x i8>
	ret <16 x i8> %out			ret <16 x i8> %out
	}			}

	define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2) {			define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2) {
	; CHECK-LABEL: vmovn16_trunc2:			; CHECK-LABEL: vmovn16_trunc2:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: vmovnt.i16 q1, q0			; CHECK-NEXT: vmovnt.i16 q1, q0
	; CHECK-NEXT: vmov q0, q1			; CHECK-NEXT: vmov q0, q1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	;			;
	; CHECKBE-LABEL: vmovn16_trunc2:			; CHECKBE-LABEL: vmovn16_trunc2:
	; CHECKBE: @ %bb.0: @ %entry			; CHECKBE: @ %bb.0: @ %entry
	; CHECKBE-NEXT: vrev64.8 q2, q0			; CHECKBE-NEXT: vrev64.16 q2, q0
	; CHECKBE-NEXT: vrev64.8 q3, q1			; CHECKBE-NEXT: vrev64.16 q3, q1
	; CHECKBE-NEXT: vmovnt.i16 q3, q2			; CHECKBE-NEXT: vmovnt.i16 q3, q2
	; CHECKBE-NEXT: vrev64.8 q0, q3			; CHECKBE-NEXT: vrev64.8 q0, q3
	; CHECKBE-NEXT: bx lr			; CHECKBE-NEXT: bx lr
	entry:			entry:
	%strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>			%strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
	%out = trunc <16 x i16> %strided.vec to <16 x i8>			%out = trunc <16 x i16> %strided.vec to <16 x i8>
	ret <16 x i8> %out			ret <16 x i8> %out
	}			}
	▲ Show 20 Lines • Show All 663 Lines • Show Last 20 Lines