This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
lib/Target/ARM/
-
Target/
-
ARM/
-
ARMISelLowering.h
-
ARMISelLowering.cpp
-
test/CodeGen/ARM/
-
CodeGen/
-
ARM/
-
vext.ll
-
vpadd.ll
-
vuzp.ll
-
vzip.ll

Differential D27774

[ARM] Implement isExtractSubvectorCheap
ClosedPublic

Authored by efriedma on Dec 14 2016, 2:01 PM.

Download Raw Diff

Details

Reviewers

rengolin
t.p.northover
jmolloy

Commits

rGd03df8145f3b: [ARM] Implement isExtractSubvectorCheap.
rL290198: [ARM] Implement isExtractSubvectorCheap.

Summary

See https://reviews.llvm.org/D6678 for the history of isExtractSubvectorCheap. Essentially the same considerations apply to ARM.

This temporarily breaks the formation of vpadd/vpaddl in certain cases; AddCombineToVPADDL essentially assumes that we won't form VUZP shuffles. This is mostly orthogonal, though, so I'll fix it in a followup.

Diff Detail

Repository: rL LLVM

Event Timeline

efriedma updated this revision to Diff 81461. Dec 14 2016, 2:01 PM

efriedma retitled this revision from to [ARM] Implement isExtractSubvectorCheap.

efriedma updated this object.

efriedma added reviewers: rengolin, t.p.northover, jmolloy.

efriedma set the repository for this revision to rL LLVM.

efriedma added subscribers: mkuper, llvm-commits.

Herald added a subscriber: aemerson. · View Herald Transcript · Dec 14 2016, 2:01 PM

efriedma added a child revision: D27779: [ARM] More aggressive matching for vpadd and vpaddl.. Dec 14 2016, 3:31 PM

Ping.

Hi Eli,

Just making sure the vorr can't move past the vld1s, LGTM. Thanks!

Looking forward to the follow up to fix vpadd.

test/CodeGen/ARM/vext.ll
144 ↗	(On Diff #81461)	Currently, the `vmov.u16` gets moved in between the `vld1`s, so I worry that the compiler might try to do that again with the `vorr` and the `CHECK-NEXT` could fail.

This revision is now accepted and ready to land. Dec 20 2016, 3:38 AM

efriedma added inline comments. Dec 20 2016, 11:16 AM

test/CodeGen/ARM/vext.ll
144 ↗	(On Diff #81461)	I think I'll just generate the checks using update_llc_test_checks.py to make it easy to regenerate if a scheduler change makes it move for some reason. (I think we really need exhaustive checks for shuffles; it can be hard to spot regressions otherwise.)

Closed by commit rL290198: [ARM] Implement isExtractSubvectorCheap. (authored by efriedma). · Explain Why · Dec 20 2016, 12:15 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

ARM/

ARMISelLowering.h

4 lines

ARMISelLowering.cpp

8 lines

test/

CodeGen/

ARM/

34 lines

55 lines

31 lines

5 lines

Diff 82130

llvm/trunk/lib/Target/ARM/ARMISelLowering.h

Show First 20 Lines • Show All 425 Lines • ▼ Show 20 Lines	bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,		const CallInst &I,
unsigned Intrinsic) const override;		unsigned Intrinsic) const override;

/// \brief Returns true if it is beneficial to convert a load of a constant		/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.		/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,		bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;		Type *Ty) const override;

		/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
		/// with this index.
		bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;

/// \brief Returns true if an argument of type Ty needs to be passed in a		/// \brief Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.		/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(		bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;		Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;

/// If a physical register, this returns the register that receives the		/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.		/// exception address on entry to an EH pad.
unsigned		unsigned
▲ Show 20 Lines • Show All 275 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,915 Lines • ▼ Show 20 Lines	bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
assert(Ty->isIntegerTy());		assert(Ty->isIntegerTy());

unsigned Bits = Ty->getPrimitiveSizeInBits();		unsigned Bits = Ty->getPrimitiveSizeInBits();
if (Bits == 0 \|\| Bits > 32)		if (Bits == 0 \|\| Bits > 32)
return false;		return false;
return true;		return true;
}		}

		bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
		unsigned Index) const {
		if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
		return false;

		return (Index == 0 \|\| Index == ResVT.getVectorNumElements());
		}

Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,		Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
ARM_MB::MemBOpt Domain) const {		ARM_MB::MemBOpt Domain) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();		Module *M = Builder.GetInsertBlock()->getParent()->getParent();

// First, if the target has no DMB, see what fallback we can use.		// First, if the target has no DMB, see what fallback we can use.
if (!Subtarget->hasDataBarrier()) {		if (!Subtarget->hasDataBarrier()) {
// Some ARMv6 cpus can support data barriers with an mcr instruction.		// Some ARMv6 cpus can support data barriers with an mcr instruction.
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get		// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
▲ Show 20 Lines • Show All 565 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/vext.ll

	Show First 20 Lines • Show All 211 Lines • ▼ Show 20 Lines

	; We should ignore a build_vector with more than two sources.			; We should ignore a build_vector with more than two sources.
	; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.			; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.
	; Try to look for fallback to by-element inserts.			; Try to look for fallback to by-element inserts.
	define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {			define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
	; CHECK-LABEL: test_multisource:			; CHECK-LABEL: test_multisource:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: mov r1, r0			; CHECK-NEXT: mov r1, r0
	; CHECK-NEXT: add r2, r0, #48			; CHECK-NEXT: add r2, r0, #32
	; CHECK-NEXT: add r0, r0, #32			; CHECK-NEXT: add r0, r0, #48
	; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!			; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
	; CHECK-NEXT: vld1.64 {d20, d21}, [r2:128]			; CHECK-NEXT: vld1.64 {d20, d21}, [r2:128]
	; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]			; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
	; CHECK-NEXT: vmov.u16 r1, d16[0]			; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]
	; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]			; CHECK-NEXT: vorr d24, d20, d20
	; CHECK-NEXT: vmov.16 d22[0], r1			; CHECK-NEXT: vzip.16 d24, d18
	; CHECK-NEXT: vmov.u16 r0, d18[0]			; CHECK-NEXT: vext.16 d18, d20, d24, #2
	; CHECK-NEXT: vmov.u16 r1, d16[0]			; CHECK-NEXT: vtrn.16 q8, q11
	; CHECK-NEXT: vmov.16 d22[1], r0			; CHECK-NEXT: vext.16 d16, d18, d16, #2
	; CHECK-NEXT: vmov.u16 r0, d20[0]			; CHECK-NEXT: vext.16 d16, d16, d16, #2
	; CHECK-NEXT: vmov.16 d22[2], r1			; CHECK-NEXT: vmov r0, r1, d16
	; CHECK-NEXT: vmov.16 d22[3], r0
	; CHECK-NEXT: vmov r0, r1, d22
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <32 x i16>, <32 x i16>* %B			%tmp1 = load <32 x i16>, <32 x i16>* %B
	%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>			%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
	ret <4 x i16> %tmp2			ret <4 x i16> %tmp2
	}			}

	; We don't handle shuffles using more than half of a 128-bit vector.			; We don't handle shuffles using more than half of a 128-bit vector.
	; Again, test for fallback to by-element inserts.			; Again, test for fallback to by-element inserts.
	define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {			define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
	; CHECK-LABEL: test_largespan:			; CHECK-LABEL: test_largespan:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vld1.64 {d16, d17}, [r0]			; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
	; CHECK-NEXT: vmov.u16 r1, d16[0]			; CHECK-NEXT: vorr d18, d16, d16
	; CHECK-NEXT: vmov.u16 r0, d16[2]			; CHECK-NEXT: vuzp.16 d18, d17
	; CHECK-NEXT: vmov.16 d18[0], r1
	; CHECK-NEXT: vmov.u16 r1, d17[0]
	; CHECK-NEXT: vmov.16 d18[1], r0
	; CHECK-NEXT: vmov.u16 r0, d17[2]
	; CHECK-NEXT: vmov.16 d18[2], r1
	; CHECK-NEXT: vmov.16 d18[3], r0
	; CHECK-NEXT: vmov r0, r1, d18			; CHECK-NEXT: vmov r0, r1, d18
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %B			%tmp1 = load <8 x i16>, <8 x i16>* %B
	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>			%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
	ret <4 x i16> %tmp2			ret <4 x i16> %tmp2
	}			}

	; The actual shuffle code only handles some cases, make sure we check			; The actual shuffle code only handles some cases, make sure we check
	▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/vpadd.ll

	Show First 20 Lines • Show All 207 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: vmov r0, r1, d16			; CHECK-NEXT: vmov r0, r1, d16
	; CHECK-NEXT: vmov r2, r3, d17			; CHECK-NEXT: vmov r2, r3, d17
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%tmp1 = load <4 x i32>, <4 x i32>* %A			%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)			%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
	ret <2 x i64> %tmp2			ret <2 x i64> %tmp2
	}			}

	; Test AddCombine optimization that generates a vpaddl.s			; Combine vuzp+vadd->vpadd.
	define void @addCombineToVPADDL() nounwind ssp {			; FIXME: Implement this optimization
	; CHECK-LABEL: addCombineToVPADDL:			define void @addCombineToVPADD(<16 x i8> %cbcr, <8 x i8> %X) nounwind ssp {
	; CHECK: @ BB#0:			; CHECK-LABEL: addCombineToVPADD:
	; CHECK-NEXT: .save {r11}			; CHECK: @ BB#0:
	; CHECK-NEXT: push {r11}			; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
	; CHECK-NEXT: .setfp r11, sp			; CHECK-NEXT: vorr d18, d17, d17
	; CHECK-NEXT: mov r11, sp			; CHECK-NEXT: vuzp.8 d16, d18
	; CHECK-NEXT: .pad #44			; CHECK-NEXT: vadd.i8 d16, d18, d16
	; CHECK-NEXT: sub sp, sp, #44			; CHECK-NEXT: vstr d16, [r1]
	; CHECK-NEXT: bic sp, sp, #15
	; CHECK-NEXT: add r0, sp, #16
	; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
	; CHECK-NEXT: vpaddl.s8 q8, q8
	; CHECK-NEXT: vmovn.i16 d16, q8
	; CHECK-NEXT: vstr d16, [sp, #8]
	; CHECK-NEXT: mov sp, r11
	; CHECK-NEXT: pop {r11}
	; CHECK-NEXT: mov pc, lr			; CHECK-NEXT: mov pc, lr
	%cbcr = alloca <16 x i8>, align 16
	%X = alloca <8 x i8>, align 8
	%tmp = load <16 x i8>, <16 x i8>* %cbcr			%tmp = load <16 x i8>, <16 x i8>* %cbcr
	%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>			%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
	%tmp2 = load <16 x i8>, <16 x i8>* %cbcr			%tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	%add = add <8 x i8> %tmp3, %tmp1			%add = add <8 x i8> %tmp3, %tmp1
	store <8 x i8> %add, <8 x i8>* %X, align 8			store <8 x i8> %add, <8 x i8>* %X, align 8
	ret void			ret void
	}			}

				; Combine vuzp+vaddl->vpaddl
				; FIXME: Implement this optimization.
				define void @addCombineToVPADDL_sext(<16 x i8> %cbcr, <8 x i16> %X) nounwind ssp {
				; CHECK-LABEL: addCombineToVPADDL_sext:
				; CHECK: @ BB#0:
				; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
				; CHECK-NEXT: vorr d18, d17, d17
				; CHECK-NEXT: vuzp.8 d16, d18
				; CHECK-NEXT: vaddl.s8 q8, d18, d16
				; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
				; CHECK-NEXT: mov pc, lr
				%tmp = load <16 x i8>, <16 x i8>* %cbcr
				%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
				%tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
				%add = add <8 x i16> %tmp4, %tmp5
				store <8 x i16> %add, <8 x i16>* %X, align 8
				ret void
				}

	; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from			; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
	; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.			; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
	define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {			define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
	; CHECK-LABEL: fromExtendingExtractVectorElt:			; CHECK-LABEL: fromExtendingExtractVectorElt:
	; CHECK: @ BB#0:			; CHECK: @ BB#0:
	; CHECK-NEXT: vmov d16, r0, r1			; CHECK-NEXT: vmov d16, r0, r1
	; CHECK-NEXT: vpaddl.s16 d16, d16			; CHECK-NEXT: vpaddl.s16 d16, d16
	; CHECK-NEXT: vmov r0, r1, d16			; CHECK-NEXT: vmov r0, r1, d16
	Show All 22 Lines

llvm/trunk/test/CodeGen/ARM/vuzp.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \| FileCheck %s		; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \| FileCheck %s

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {		define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:		; CHECK-LABEL: vuzpi8:
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]		; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]		; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.8 d17, d16		; CHECK-NEXT: vuzp.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16		; CHECK-NEXT: vmul.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16		; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr		; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A		%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B		%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>		%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>		%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%tmp5 = add <8 x i8> %tmp3, %tmp4		%tmp5 = mul <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5		ret <8 x i8> %tmp5
}		}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {		define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:		; CHECK-LABEL: vuzpi8_Qres:
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NEXT: vldr d17, [r1]		; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]		; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vuzp.8 d16, d17		; CHECK-NEXT: vuzp.8 d16, d17
; CHECK-NEXT: vmov r0, r1, d16		; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17		; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr		; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A		%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B		%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>		%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <16 x i8> %tmp3		ret <16 x i8> %tmp3
}		}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {		define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:		; CHECK-LABEL: vuzpi16:
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]		; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]		; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.16 d17, d16		; CHECK-NEXT: vuzp.16 d17, d16
; CHECK-NEXT: vadd.i16 d16, d17, d16		; CHECK-NEXT: vmul.i16 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16		; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr		; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A		%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B		%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>		%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%tmp5 = add <4 x i16> %tmp3, %tmp4		%tmp5 = mul <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5		ret <4 x i16> %tmp5
}		}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {		define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:		; CHECK-LABEL: vuzpi16_Qres:
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NEXT: vldr d17, [r1]		; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]		; CHECK-NEXT: vldr d16, [r0]
▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
; Undef shuffle indices should not prevent matching to VUZP:		; Undef shuffle indices should not prevent matching to VUZP:

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {		define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:		; CHECK-LABEL: vuzpi8_undef:
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NEXT: vldr d16, [r1]		; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]		; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vuzp.8 d17, d16		; CHECK-NEXT: vuzp.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16		; CHECK-NEXT: vmul.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16		; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr		; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A		%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B		%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>		%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>		%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
%tmp5 = add <8 x i8> %tmp3, %tmp4		%tmp5 = mul <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5		ret <8 x i8> %tmp5
}		}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {		define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:		; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NEXT: vldr d17, [r1]		; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]		; CHECK-NEXT: vldr d16, [r0]
▲ Show 20 Lines • Show All 319 Lines • ▼ Show 20 Lines	; CHECK-NEXT: .byte 10 @ 0xa
<5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {		<5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
%cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4		%cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>		%cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
%c0 = icmp ult <5 x i32> %cmp0, %cmp1		%c0 = icmp ult <5 x i32> %cmp0, %cmp1
%c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>		%c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1		%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
ret <10 x i8> %rv		ret <10 x i8> %rv
}		}

		%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
		define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
		; CHECK-LABEL: vuzp_extract_subvector:
		; CHECK: @ BB#0:
		; CHECK-NEXT: vmov d17, r2, r3
		; CHECK-NEXT: vmov d16, r0, r1
		; CHECK-NEXT: vorr d18, d17, d17
		; CHECK-NEXT: vuzp.8 d16, d18
		; CHECK-NEXT: vmov r0, r1, d16
		; CHECK-NEXT: vmov r2, r3, d18
		; CHECK-NEXT: mov pc, lr

		%vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
		%vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
		%.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
		%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
		ret %struct.uint8x8x2_t %.fca.0.1.insert
		}

llvm/trunk/test/CodeGen/ARM/vzip.ll

Show First 20 Lines • Show All 326 Lines • ▼ Show 20 Lines	entry:
store <4 x i32> %0, <4 x i32>* %B		store <4 x i32> %0, <4 x i32>* %B
ret void		ret void
}		}

define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {		define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vzip_vext_factor:		; CHECK-LABEL: vzip_vext_factor:
; CHECK: @ BB#0: @ %entry		; CHECK: @ BB#0: @ %entry
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]		; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vext.16 d16, d16, d17, #3		; CHECK-NEXT: vext.16 d18, d16, d17, #1
; CHECK-NEXT: vext.16 d17, d16, d16, #1		; CHECK-NEXT: vext.16 d16, d18, d17, #2
; CHECK-NEXT: vzip.16 d16, d17
; CHECK-NEXT: vext.16 d16, d16, d16, #1		; CHECK-NEXT: vext.16 d16, d16, d16, #1
; CHECK-NEXT: vstr d16, [r1]		; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr		; CHECK-NEXT: mov pc, lr
entry:		entry:
%tmp1 = load <8 x i16>, <8 x i16>* %A		%tmp1 = load <8 x i16>, <8 x i16>* %A
%0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 4, i32 5, i32 3>		%0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 4, i32 5, i32 3>
store <4 x i16> %0, <4 x i16>* %B		store <4 x i16> %0, <4 x i16>* %B
ret void		ret void
Show All 20 Lines