This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Target/ARM/
-
Target/
-
ARM/
-
ARMISelLowering.h
-
ARMISelLowering.cpp
-
test/CodeGen/ARM/
-
CodeGen/
-
ARM/
2
vext.ll
-
vpadd.ll
-
vuzp.ll
-
vzip.ll

Differential D27774

[ARM] Implement isExtractSubvectorCheap
ClosedPublic

Authored by efriedma on Dec 14 2016, 2:01 PM.

Download Raw Diff

Details

Reviewers

rengolin
t.p.northover
jmolloy

Commits

rGd03df8145f3b: [ARM] Implement isExtractSubvectorCheap.
rL290198: [ARM] Implement isExtractSubvectorCheap.

Summary

See https://reviews.llvm.org/D6678 for the history of isExtractSubvectorCheap. Essentially the same considerations apply to ARM.

This temporarily breaks the formation of vpadd/vpaddl in certain cases; AddCombineToVPADDL essentially assumes that we won't form VUZP shuffles. This is mostly orthogonal, though, so I'll fix it in a followup.

Diff Detail

Repository: rL LLVM

Event Timeline

efriedma updated this revision to Diff 81461. Dec 14 2016, 2:01 PM

efriedma retitled this revision to [ARM] Implement isExtractSubvectorCheap.

efriedma updated this object.

efriedma added reviewers: rengolin, t.p.northover, jmolloy.

efriedma set the repository for this revision to rL LLVM.

efriedma added subscribers: mkuper, llvm-commits.

Herald added a subscriber: aemerson. · View Herald Transcript · Dec 14 2016, 2:01 PM

efriedma added a child revision: D27779: [ARM] More aggressive matching for vpadd and vpaddl. Dec 14 2016, 3:31 PM

Ping.

Hi Eli,

Just making sure the vorr can't move past the vld1s, LGTM. Thanks!

Looking forward to the follow up to fix vpadd.

test/CodeGen/ARM/vext.ll
144	Currently, the `vmov.u16` gets moved in between the `vld1`s, so I worry that the compiler might try to do that again with the `vorr` and the `CHECK-NEXT` could fail.

This revision is now accepted and ready to land. Dec 20 2016, 3:38 AM

efriedma added inline comments. Dec 20 2016, 11:16 AM

test/CodeGen/ARM/vext.ll
144	I think I'll just generate the checks using update_llc_test_checks.py to make it easy to regenerate if a scheduler change makes it move for some reason. (I think we really need exhaustive checks for shuffles; it can be hard to spot regressions otherwise.)

Closed by commit rL290198: [ARM] Implement isExtractSubvectorCheap. (authored by efriedma). · Explain Why · Dec 20 2016, 12:15 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

ARM/

ARMISelLowering.h

4 lines

ARMISelLowering.cpp

8 lines

test/

CodeGen/

ARM/

32 lines

30 lines

29 lines

7 lines

Diff 81461

lib/Target/ARM/ARMISelLowering.h

Show First 20 Lines • Show All 425 Lines • ▼ Show 20 Lines	bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,		const CallInst &I,
unsigned Intrinsic) const override;		unsigned Intrinsic) const override;

/// \brief Returns true if it is beneficial to convert a load of a constant		/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.		/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,		bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;		Type *Ty) const override;

		/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
		/// with this index.
		bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;

/// \brief Returns true if an argument of type Ty needs to be passed in a		/// \brief Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.		/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(		bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;		Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;

/// If a physical register, this returns the register that receives the		/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.		/// exception address on entry to an EH pad.
unsigned		unsigned
▲ Show 20 Lines • Show All 272 Lines • Show Last 20 Lines

lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,893 Lines • ▼ Show 20 Lines	bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
assert(Ty->isIntegerTy());		assert(Ty->isIntegerTy());

unsigned Bits = Ty->getPrimitiveSizeInBits();		unsigned Bits = Ty->getPrimitiveSizeInBits();
if (Bits == 0 \|\| Bits > 32)		if (Bits == 0 \|\| Bits > 32)
return false;		return false;
return true;		return true;
}		}

		bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
		unsigned Index) const {
		if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
		return false;

		return (Index == 0 \|\| Index == ResVT.getVectorNumElements());
		}

Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,		Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
ARM_MB::MemBOpt Domain) const {		ARM_MB::MemBOpt Domain) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();		Module *M = Builder.GetInsertBlock()->getParent()->getParent();

// First, if the target has no DMB, see what fallback we can use.		// First, if the target has no DMB, see what fallback we can use.
if (!Subtarget->hasDataBarrier()) {		if (!Subtarget->hasDataBarrier()) {
// Some ARMv6 cpus can support data barriers with an mcr instruction.		// Some ARMv6 cpus can support data barriers with an mcr instruction.
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get		// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
▲ Show 20 Lines • Show All 565 Lines • Show Last 20 Lines

test/CodeGen/ARM/vext.ll

	Show First 20 Lines • Show All 128 Lines • ▼ Show 20 Lines
	;CHECK-LABEL: test_undef:			;CHECK-LABEL: test_undef:
	;CHECK: vzip.16			;CHECK: vzip.16
	%tmp1 = load <8 x i16>, <8 x i16>* %A			%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B			%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>			%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>
	ret <4 x i16> %tmp3			ret <4 x i16> %tmp3
	}			}

	; We should ignore a build_vector with more than two sources.			; FIXME: Lower this more efficiently. (Given an arbitrary <32 x i16>, I think
	; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.			; the most efficient lowering is three vext shuffles.)
	; Try to look for fallback to by-element inserts.
	define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {			define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
	;CHECK-LABEL: test_multisource:			;CHECK-LABEL: test_multisource:
	;CHECK: vmov.16 [[REG:d[0-9]+]][0]			;CHECK: vld1.16
	;CHECK: vmov.16 [[REG]][1]			;CHECK-NEXT: vld1.64
	;CHECK: vmov.16 [[REG]][2]			;CHECK-NEXT: vld1.64
	;CHECK: vmov.16 [[REG]][3]			;CHECK-NEXT: vld1.64
				rengolin (inline comment; unsubmitted, not done): Currently, the `vmov.u16` gets moved in between the `vld1`s, so I worry that the compiler might try to do that again with the `vorr` and the `CHECK-NEXT` could fail.
				efriedma (author; inline comment; unsubmitted, not done): I think I'll just generate the checks using update_llc_test_checks.py to make it easy to regenerate if a scheduler change makes it move for some reason. (I think we really need exhaustive checks for shuffles; it can be hard to spot regressions otherwise.)
				;CHECK-NEXT: vorr
				;CHECK-NEXT: vzip.16
				;CHECK-NEXT: vext.16
				;CHECK-NEXT: vtrn.16
				;CHECK-NEXT: vext.16
				;CHECK-NEXT: vext.16
				;CHECK-NEXT: vmov r0, r1
	%tmp1 = load <32 x i16>, <32 x i16>* %B			%tmp1 = load <32 x i16>, <32 x i16>* %B
	%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>			%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
	ret <4 x i16> %tmp2			ret <4 x i16> %tmp2
	}			}

	; We don't handle shuffles using more than half of a 128-bit vector.			; If we split the operand into two <4 x i16> vectors, this becomes
	; Again, test for fallback to by-element inserts.			; a vuzp.
	define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {			define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
	;CHECK-LABEL: test_largespan:			;CHECK-LABEL: test_largespan:
	;CHECK: vmov.16 [[REG:d[0-9]+]][0]			;CHECK: vld1.64
	;CHECK: vmov.16 [[REG]][1]			;CHECK-NEXT: vorr
	;CHECK: vmov.16 [[REG]][2]			;CHECK-NEXT: vuzp.16
	;CHECK: vmov.16 [[REG]][3]			;CHECK-NEXT: vmov r0, r1
	%tmp1 = load <8 x i16>, <8 x i16>* %B			%tmp1 = load <8 x i16>, <8 x i16>* %B
	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>			%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
	ret <4 x i16> %tmp2			ret <4 x i16> %tmp2
	}			}

	; The actual shuffle code only handles some cases, make sure we check			; The actual shuffle code only handles some cases, make sure we check
	; this rather than blindly emitting a VECTOR_SHUFFLE (infinite			; this rather than blindly emitting a VECTOR_SHUFFLE (infinite
	; lowering loop can result otherwise).			; lowering loop can result otherwise).
	▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

test/CodeGen/ARM/vpadd.ll

	Show First 20 Lines • Show All 132 Lines • ▼ Show 20 Lines
	define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {			define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
	;CHECK-LABEL: vpaddlQu32:			;CHECK-LABEL: vpaddlQu32:
	;CHECK: vpaddl.u32			;CHECK: vpaddl.u32
	%tmp1 = load <4 x i32>, <4 x i32>* %A			%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)			%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
	ret <2 x i64> %tmp2			ret <2 x i64> %tmp2
	}			}

	; Test AddCombine optimization that generates a vpaddl.s			; Combine vuzp+vadd->vpadd.
	define void @addCombineToVPADDL() nounwind ssp {			; FIXME: Implement this optimization
	; CHECK: vpaddl.s8			define void @addCombineToVPADD(<16 x i8> %cbcr, <8 x i8> %X) nounwind ssp {
	%cbcr = alloca <16 x i8>, align 16			; CHECK-LABEL: addCombineToVPADD:
	%X = alloca <8 x i8>, align 8			; CHECK: vuzp.8
				; CHECK: vadd.i8
	%tmp = load <16 x i8>, <16 x i8>* %cbcr			%tmp = load <16 x i8>, <16 x i8>* %cbcr
	%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>			%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
	%tmp2 = load <16 x i8>, <16 x i8>* %cbcr			%tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	%add = add <8 x i8> %tmp3, %tmp1			%add = add <8 x i8> %tmp3, %tmp1
	store <8 x i8> %add, <8 x i8>* %X, align 8			store <8 x i8> %add, <8 x i8>* %X, align 8
	ret void			ret void
	}			}

				; Combine vuzp+vaddl->vpaddl
				; FIXME: Implement this optimization.
				define void @addCombineToVPADDL_sext(<16 x i8> %cbcr, <8 x i16> %X) nounwind ssp {
				; CHECK-LABEL: addCombineToVPADDL_sext:
				; CHECK: vuzp.8
				; CHECK: vaddl.s8
				%tmp = load <16 x i8>, <16 x i8>* %cbcr
				%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
				%tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
				%add = add <8 x i16> %tmp4, %tmp5
				store <8 x i16> %add, <8 x i16>* %X, align 8
				ret void
				}

	; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from			; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
	; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.			; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
	define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {			define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
	;CHECK-LABEL: fromExtendingExtractVectorElt:			;CHECK-LABEL: fromExtendingExtractVectorElt:
	;CHECK: vpaddl.s16			;CHECK: vpaddl.s16
	%tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>			%tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
	%tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>			%tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
	%x = add <2 x i16> %tmp2, %tmp1			%x = add <2 x i16> %tmp2, %tmp1
	Show All 18 Lines

test/CodeGen/ARM/vuzp.ll

	; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \| FileCheck %s			; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \| FileCheck %s

	define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	; CHECK-LABEL: vuzpi8:			; CHECK-LABEL: vuzpi8:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vldr d16, [r1]			; CHECK-NEXT: vldr d16, [r1]
	; CHECK-NEXT: vldr d17, [r0]			; CHECK-NEXT: vldr d17, [r0]
	; CHECK-NEXT: vuzp.8 d17, d16			; CHECK-NEXT: vuzp.8 d17, d16
	; CHECK-NEXT: vadd.i8 d16, d17, d16			; CHECK-NEXT: vmul.i8 d16, d17, d16
	; CHECK-NEXT: vmov r0, r1, d16			; CHECK-NEXT: vmov r0, r1, d16
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>			%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>			%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	%tmp5 = add <8 x i8> %tmp3, %tmp4			%tmp5 = mul <8 x i8> %tmp3, %tmp4
	ret <8 x i8> %tmp5			ret <8 x i8> %tmp5
	}			}

	define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	; CHECK-LABEL: vuzpi8_Qres:			; CHECK-LABEL: vuzpi8_Qres:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]			; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
	; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]			; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
	; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]			; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
	; CHECK-NEXT: vmov r0, r1, [[LDR0]]			; CHECK-NEXT: vmov r0, r1, [[LDR0]]
	; CHECK-NEXT: vmov r2, r3, [[LDR1]]			; CHECK-NEXT: vmov r2, r3, [[LDR1]]
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>			%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	ret <16 x i8> %tmp3			ret <16 x i8> %tmp3
	}			}

	define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {			define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	; CHECK-LABEL: vuzpi16:			; CHECK-LABEL: vuzpi16:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vldr d16, [r1]			; CHECK-NEXT: vldr d16, [r1]
	; CHECK-NEXT: vldr d17, [r0]			; CHECK-NEXT: vldr d17, [r0]
	; CHECK-NEXT: vuzp.16 d17, d16			; CHECK-NEXT: vuzp.16 d17, d16
	; CHECK-NEXT: vadd.i16 d16, d17, d16			; CHECK-NEXT: vmul.i16 d16, d17, d16
	; CHECK-NEXT: vmov r0, r1, d16			; CHECK-NEXT: vmov r0, r1, d16
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A			%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B			%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>			%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
	%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>			%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
	%tmp5 = add <4 x i16> %tmp3, %tmp4			%tmp5 = mul <4 x i16> %tmp3, %tmp4
	ret <4 x i16> %tmp5			ret <4 x i16> %tmp5
	}			}

	define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {			define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	; CHECK-LABEL: vuzpi16_Qres:			; CHECK-LABEL: vuzpi16_Qres:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]			; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
	; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]			; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
	▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
	; Undef shuffle indices should not prevent matching to VUZP:			; Undef shuffle indices should not prevent matching to VUZP:

	define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	; CHECK-LABEL: vuzpi8_undef:			; CHECK-LABEL: vuzpi8_undef:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vldr d16, [r1]			; CHECK-NEXT: vldr d16, [r1]
	; CHECK-NEXT: vldr d17, [r0]			; CHECK-NEXT: vldr d17, [r0]
	; CHECK-NEXT: vuzp.8 d17, d16			; CHECK-NEXT: vuzp.8 d17, d16
	; CHECK-NEXT: vadd.i8 d16, d17, d16			; CHECK-NEXT: vmul.i8 d16, d17, d16
	; CHECK-NEXT: vmov r0, r1, d16			; CHECK-NEXT: vmov r0, r1, d16
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A			%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B			%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>			%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>			%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
	%tmp5 = add <8 x i8> %tmp3, %tmp4			%tmp5 = mul <8 x i8> %tmp3, %tmp4
	ret <8 x i8> %tmp5			ret <8 x i8> %tmp5
	}			}

	define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {			define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	; CHECK-LABEL: vuzpi8_undef_Qres:			; CHECK-LABEL: vuzpi8_undef_Qres:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]			; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
	; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]			; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
	▲ Show 20 Lines • Show All 140 Lines • ▼ Show 20 Lines
	; CHECK: vbsl			; CHECK: vbsl
	%cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4			%cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
	%cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>			%cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
	%c0 = icmp ult <5 x i32> %cmp0, %cmp1			%c0 = icmp ult <5 x i32> %cmp0, %cmp1
	%c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>			%c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
	%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1			%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
	ret <10 x i8> %rv			ret <10 x i8> %rv
	}			}

				%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
				define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
				; CHECK-LABEL: vuzp_extract_subvector
				; CHECK: vmov
				; CHECK-NEXT: vmov
				; CHECK-NEXT: vorr
				; CHECK-NEXT: vuzp.8
				; CHECK-NEXT: vmov
				; CHECK-NEXT: vmov

				%vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
				%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
				ret %struct.uint8x8x2_t %.fca.0.1.insert
				}

test/CodeGen/ARM/vzip.ll

Show First 20 Lines • Show All 303 Lines • ▼ Show 20 Lines	entry:
%0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>		%0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
store <4 x i32> %0, <4 x i32>* %B		store <4 x i32> %0, <4 x i32>* %B
ret void		ret void
}		}

define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {		define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {
entry:		entry:
; CHECK-LABEL: vzip_vext_factor		; CHECK-LABEL: vzip_vext_factor
; CHECK: vext.16 d16, d16, d17, #3		; CHECK: vld1.64
; CHECK: vzip		; CHECK-NEXT: vext.16
		; CHECK-NEXT: vext.16
		; CHECK-NEXT: vext.16
		; CHECK-NEXT: vstr
%tmp1 = load <8 x i16>, <8 x i16>* %A		%tmp1 = load <8 x i16>, <8 x i16>* %A
%0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 4, i32 5, i32 3>		%0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 4, i32 5, i32 3>
store <4 x i16> %0, <4 x i16>* %B		store <4 x i16> %0, <4 x i16>* %B
ret void		ret void
}		}