This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
lib/Target/ARM/
-
Target/
-
ARM/
-
ARMISelLowering.cpp
-
test/CodeGen/ARM/
-
CodeGen/
-
ARM/
-
big-endian-vector-callee.ll
-
vcombine.ll
-
vext.ll
-
vtrn.ll

Differential D27624

[ARM] Split 128-bit vectors in BUILD_VECTOR lowering
ClosedPublic

Authored by efriedma on Dec 9 2016, 11:25 AM.

Download Raw Diff

Details

Reviewers

rengolin
t.p.northover
ab
jmolloy

Commits

rGcbed30c5012b: [ARM] Split 128-bit vectors in BUILD_VECTOR lowering
rL289706: [ARM] Split 128-bit vectors in BUILD_VECTOR lowering

Summary

Given that INSERT_VECTOR_ELT operates on D registers anyway, combining 64-bit vectors into a 128-bit vector is basically free. Therefore, try to split BUILD_VECTOR nodes before giving up and lowering them to a series of INSERT_VECTOR_ELT instructions. Sometimes this allows dramatically better lowerings; see testcases for examples. Inspired by similar code in the x86 backend for AVX.

For the @vcombine_vdup testcase, I'm not happy with the DAGCombine transforms which produce a BUILD_VECTOR in the first place; we're taking splat shuffles which were carefully preserved in the IR, and destroying them in DAGCombine by transforming concat_vec(splat(a), splat(b)) -> concat_vec(build_vector(a,a,a,a), build_vector(b,b,b,b)) -> build_vector(a,a,a,a,b,b,b,b). Maybe we could improve this somehow?

Diff Detail

Repository: rL LLVM

Event Timeline

efriedma updated this revision to Diff 80918.Dec 9 2016, 11:25 AM

efriedma retitled this revision from to [ARM] Split 128-bit vectors in BUILD_VECTOR lowering.

efriedma updated this object.

efriedma added reviewers: ab, t.p.northover, jmolloy.

efriedma set the repository for this revision to rL LLVM.

efriedma added subscribers: llvm-commits, RKSimon, craig.topper.

Herald added subscribers: rengolin, aemerson. · View Herald Transcript · Dec 9 2016, 11:25 AM

Hi Eli,

This change looks good to me. I just have a small comment inline.

cheers,
--renato

lib/Target/ARM/ARMISelLowering.cpp
6232 ↗	(On Diff #80918)	I'm assuming `LowerBUILD_VECTOR` can return `SDNode()` if there's no need for it, thus the concat only happening if both were lowered. Is this your strategy around "we might discover a better way to lower it"? How can we avoid cases where it doesn't?

efriedma added inline comments.Dec 14 2016, 10:40 AM

lib/Target/ARM/ARMISelLowering.cpp
6232 ↗	(On Diff #80918)	LowerBUILD_VECTOR can return SDNode() in some cases... I think in practice, though, it only fails for splats and constants. Mostly, I'm just depending on the fact that the worst case isn't any worse than what we would do anyway; a series of INSERT_VECTOR_ELT operations on two 64-bit vectors is roughly equivalent to a series of INSERT_VECTOR_ELT operations on one 128-bit vector.

LGTM. Thanks!

lib/Target/ARM/ARMISelLowering.cpp
6232 ↗	(On Diff #80918)	Makes sense.

This revision is now accepted and ready to land.Dec 14 2016, 11:20 AM

Closed by commit rL289706: [ARM] Split 128-bit vectors in BUILD_VECTOR lowering (authored by efriedma). · Explain Why · Dec 14 2016, 12:55 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

ARM/

ARMISelLowering.cpp

21 lines

test/

CodeGen/

ARM/

big-endian-vector-callee.ll

24 lines

vcombine.ll

18 lines

vext.ll

25 lines

vtrn.ll

22 lines

Diff 81444

llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,047 Lines • ▼ Show 20 Lines	SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());		BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
SDLoc dl(Op);		SDLoc dl(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();

APInt SplatBits, SplatUndef;		APInt SplatBits, SplatUndef;
unsigned SplatBitSize;		unsigned SplatBitSize;
bool HasAnyUndefs;		bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {		if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
		if (SplatUndef.isAllOnesValue())
		return DAG.getUNDEF(VT);

if (SplatBitSize <= 64) {		if (SplatBitSize <= 64) {
// Check if an immediate VMOV works.		// Check if an immediate VMOV works.
EVT VmovVT;		EVT VmovVT;
SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),		SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,		SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),		DAG, dl, VmovVT, VT.is128BitVector(),
VMOVModImm);		VMOVModImm);
if (Val.getNode()) {		if (Val.getNode()) {
▲ Show 20 Lines • Show All 145 Lines • ▼ Show 20 Lines	SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,

// Empirical tests suggest this is rarely worth it for vectors of length <= 2.		// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {		if (NumElts >= 4) {
SDValue shuffle = ReconstructShuffle(Op, DAG);		SDValue shuffle = ReconstructShuffle(Op, DAG);
if (shuffle != SDValue())		if (shuffle != SDValue())
return shuffle;		return shuffle;
}		}

		if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
		// If we haven't found an efficient lowering, try splitting a 128-bit vector
		// into two 64-bit vectors; we might discover a better way to lower it.
		SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
		EVT ExtVT = VT.getVectorElementType();
		EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
		SDValue Lower =
		DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
		if (Lower.getOpcode() == ISD::BUILD_VECTOR)
		Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
		SDValue Upper = DAG.getBuildVector(
		HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
		if (Upper.getOpcode() == ISD::BUILD_VECTOR)
		Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
		if (Lower && Upper)
		return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
		}

// Vectors with 32- or 64-bit elements can be built by directly assigning		// Vectors with 32- or 64-bit elements can be built by directly assigning
// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands		// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
// will be legalized.		// will be legalized.
if (EltSize >= 32) {		if (EltSize >= 32) {
// Do the expansion with floating-point types, since that is what the VFP		// Do the expansion with floating-point types, since that is what the VFP
// registers are defined to use, and since i64 is not legal.		// registers are defined to use, and since i64 is not legal.
EVT EltVT = EVT::getFloatingPointVT(EltSize);		EVT EltVT = EVT::getFloatingPointVT(EltSize);
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);		EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
▲ Show 20 Lines • Show All 7,229 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/big-endian-vector-callee.ll

Show First 20 Lines • Show All 646 Lines • ▼ Show 20 Lines	; HARD: vrev64.8 q{{[0-9]+}}, q0
%3 = fadd fp128 %2, %2		%3 = fadd fp128 %2, %2
ret fp128 %3		ret fp128 %3
; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]		; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]		; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
}		}

; CHECK-LABEL: test_v2f64_f128:		; CHECK-LABEL: test_v2f64_f128:
define <2 x double> @test_v2f64_f128(fp128 %p) {		define <2 x double> @test_v2f64_f128(fp128 %p) {
; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2		; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
		; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3		; CHECK: vmov.32 [[REG2]][1], r3
		; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p		%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <2 x double>		%2 = bitcast fp128 %1 to <2 x double>
%3 = fadd <2 x double> %2, %2		%3 = fadd <2 x double> %2, %2
ret <2 x double> %3		ret <2 x double> %3
; SOFT: vadd.f64 [[REG1:d[0-9]+]]		; SOFT: vadd.f64 [[REG1:d[0-9]+]]
; SOFT: vadd.f64 [[REG2:d[0-9]+]]		; SOFT: vadd.f64 [[REG2:d[0-9]+]]
; SOFT: vmov r1, r0, [[REG2]]		; SOFT: vmov r1, r0, [[REG2]]
; SOFT: vmov r3, r2, [[REG1]]		; SOFT: vmov r3, r2, [[REG1]]
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines
; SOFT: vmov r1, r0, [[REG2]]		; SOFT: vmov r1, r0, [[REG2]]
; SOFT: vmov r3, r2, [[REG1]]		; SOFT: vmov r3, r2, [[REG1]]
; HARD: vadd.f64 d1		; HARD: vadd.f64 d1
; HARD: vadd.f64 d0		; HARD: vadd.f64 d0
}		}

; CHECK-LABEL: test_v2i64_f128:		; CHECK-LABEL: test_v2i64_f128:
define <2 x i64> @test_v2i64_f128(fp128 %p) {		define <2 x i64> @test_v2i64_f128(fp128 %p) {
; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2		; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
		; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3		; CHECK: vmov.32 [[REG2]][1], r3
		; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p		%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <2 x i64>		%2 = bitcast fp128 %1 to <2 x i64>
%3 = add <2 x i64> %2, %2		%3 = add <2 x i64> %2, %2
ret <2 x i64> %3		ret <2 x i64> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vadd.i64 q0		; HARD: vadd.i64 q0
}		}
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	; HARD: vrev64.8 q{{[0-9]+}}, q0
ret <2 x i64> %3		ret <2 x i64> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vadd.i64 q0		; HARD: vadd.i64 q0
}		}

; CHECK-LABEL: test_v4f32_f128:		; CHECK-LABEL: test_v4f32_f128:
define <4 x float> @test_v4f32_f128(fp128 %p) {		define <4 x float> @test_v4f32_f128(fp128 %p) {
; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2		; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
		; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3		; CHECK: vmov.32 [[REG2]][1], r3
		; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p		%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <4 x float>		%2 = bitcast fp128 %1 to <4 x float>
%3 = fadd <4 x float> %2, %2		%3 = fadd <4 x float> %2, %2
ret <4 x float> %3		ret <4 x float> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.32 q0		; HARD: vrev64.32 q0
}		}
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	; HARD: vrev64.8 q{{[0-9]+}}, q0
ret <4 x float> %3		ret <4 x float> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.32 q0		; HARD: vrev64.32 q0
}		}

; CHECK-LABEL: test_v4i32_f128:		; CHECK-LABEL: test_v4i32_f128:
define <4 x i32> @test_v4i32_f128(fp128 %p) {		define <4 x i32> @test_v4i32_f128(fp128 %p) {
; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2		; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
		; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3		; CHECK: vmov.32 [[REG2]][1], r3
		; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p		%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <4 x i32>		%2 = bitcast fp128 %1 to <4 x i32>
%3 = add <4 x i32> %2, %2		%3 = add <4 x i32> %2, %2
ret <4 x i32> %3		ret <4 x i32> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.32 q0		; HARD: vrev64.32 q0
}		}
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	; HARD: vrev64.8 q{{[0-9]+}}, q0
ret <4 x i32> %3		ret <4 x i32> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.32 q0		; HARD: vrev64.32 q0
}		}

; CHECK-LABEL: test_v8i16_f128:		; CHECK-LABEL: test_v8i16_f128:
define <8 x i16> @test_v8i16_f128(fp128 %p) {		define <8 x i16> @test_v8i16_f128(fp128 %p) {
; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2		; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
		; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3		; CHECK: vmov.32 [[REG2]][1], r3
		; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p		%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <8 x i16>		%2 = bitcast fp128 %1 to <8 x i16>
%3 = add <8 x i16> %2, %2		%3 = add <8 x i16> %2, %2
ret <8 x i16> %3		ret <8 x i16> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.16 q0		; HARD: vrev64.16 q0
}		}
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	; HARD: vrev64.8 q{{[0-9]+}}, q0
ret <8 x i16> %3		ret <8 x i16> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.16 q0		; HARD: vrev64.16 q0
}		}

; CHECK-LABEL: test_v16i8_f128:		; CHECK-LABEL: test_v16i8_f128:
define <16 x i8> @test_v16i8_f128(fp128 %p) {		define <16 x i8> @test_v16i8_f128(fp128 %p) {
; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2		; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
		; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3		; CHECK: vmov.32 [[REG2]][1], r3
		; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p		%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <16 x i8>		%2 = bitcast fp128 %1 to <16 x i8>
%3 = add <16 x i8> %2, %2		%3 = add <16 x i8> %2, %2
ret <16 x i8> %3		ret <16 x i8> %3
; SOFT: vmov r1, r0		; SOFT: vmov r1, r0
; SOFT: vmov r3, r2		; SOFT: vmov r3, r2
; HARD: vrev64.8 q0		; HARD: vrev64.8 q0
}		}
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/vcombine.ll

	Show First 20 Lines • Show All 99 Lines • ▼ Show 20 Lines
	; CHECK: vget_high8			; CHECK: vget_high8
	; CHECK-NOT: vst			; CHECK-NOT: vst
	; CHECK-LE: vmov r0, r1, d17			; CHECK-LE: vmov r0, r1, d17
	; CHECK-BE: vmov r1, r0, d16			; CHECK-BE: vmov r1, r0, d16
	%tmp1 = load <16 x i8>, <16 x i8>* %A			%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>			%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	ret <8 x i8> %tmp2			ret <8 x i8> %tmp2
	}			}

				; vcombine(vld1_dup(p), vld1_dup(p2))
				define <8 x i16> @vcombine_vdup(<8 x i16> %src, i16* nocapture readonly %p) {
				; CHECK-LABEL: vcombine_vdup:
				; CHECK: vld1.16 {d16[]},
				; CHECK: vld1.16 {d17[]},
				; CHECK-LE: vmov r0, r1, d16
				; CHECK-LE: vmov r2, r3, d17
				%a1 = load i16, i16* %p, align 2
				%a2 = insertelement <4 x i16> undef, i16 %a1, i32 0
				%a3 = shufflevector <4 x i16> %a2, <4 x i16> undef, <4 x i32> zeroinitializer
				%p2 = getelementptr inbounds i16, i16* %p, i32 1
				%b1 = load i16, i16* %p2, align 2
				%b2 = insertelement <4 x i16> undef, i16 %b1, i32 0
				%b3 = shufflevector <4 x i16> %b2, <4 x i16> undef, <4 x i32> zeroinitializer
				%shuffle = shufflevector <4 x i16> %a3, <4 x i16> %b3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
				ret <8 x i16> %shuffle
				}

llvm/trunk/test/CodeGen/ARM/vext.ll

Show First 20 Lines • Show All 158 Lines • ▼ Show 20 Lines	;CHECK: vmov.16 [[REG]][3]
%tmp1 = load <8 x i16>, <8 x i16>* %B		%tmp1 = load <8 x i16>, <8 x i16>* %B
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>		%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
ret <4 x i16> %tmp2		ret <4 x i16> %tmp2
}		}

; The actual shuffle code only handles some cases, make sure we check		; The actual shuffle code only handles some cases, make sure we check
; this rather than blindly emitting a VECTOR_SHUFFLE (infinite		; this rather than blindly emitting a VECTOR_SHUFFLE (infinite
; lowering loop can result otherwise).		; lowering loop can result otherwise).
		; (There are probably better ways to lower this shuffle, but it's not
		; really important.)
define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind {		define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: test_illegal:		;CHECK-LABEL: test_illegal:
;CHECK: vmov.16 [[REG:d[0-9]+]][0]		;CHECK: vmov.u16
;CHECK: vmov.16 [[REG]][1]		;CHECK-NEXT: vmov.u16
;CHECK: vmov.16 [[REG]][2]		;CHECK-NEXT: vorr
;CHECK: vmov.16 [[REG]][3]		;CHECK-NEXT: vorr
;CHECK: vmov.16 [[REG2:d[0-9]+]][0]		;CHECK-NEXT: vmov.16
;CHECK: vmov.16 [[REG2]][1]		;CHECK-NEXT: vuzp.16
;CHECK: vmov.16 [[REG2]][2]		;CHECK-NEXT: vmov.u16
;CHECK: vmov.16 [[REG2]][3]		;CHECK-NEXT: vmov.16
		;CHECK-NEXT: vuzp.16
		;CHECK-NEXT: vmov.16
		;CHECK-NEXT: vmov.u16
		;CHECK-NEXT: vext.16
		;CHECK-NEXT: vmov.16
		;CHECK-NEXT: vmov r0, r1, d
		;CHECK-NEXT: vmov r2, r3, d
%tmp1 = load <8 x i16>, <8 x i16>* %A		%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B		%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 13, i32 3, i32 2, i32 2, i32 9>		%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 13, i32 3, i32 2, i32 2, i32 9>
ret <8 x i16> %tmp3		ret <8 x i16> %tmp3
}		}

; PR11129		; PR11129
; Make sure this doesn't crash		; Make sure this doesn't crash
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/vtrn.ll

Show First 20 Lines • Show All 366 Lines • ▼ Show 20 Lines	define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4		%cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>		%cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
%c0 = icmp ult <4 x i32> %cmp0, %cmp1		%c0 = icmp ult <4 x i32> %cmp0, %cmp1
%c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>		%c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1		%rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
ret <8 x i8> %rv		ret <8 x i8> %rv
}		}

; Negative test that should not generate a vtrn		; The shuffle mask is half a vtrn; we duplicate the half to produce the
		; full result.
define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {		define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
entry:		entry:
; CHECK-LABEL: lower_twice_no_vtrn		; CHECK-LABEL: lower_twice_no_vtrn
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NOT: vtrn		; CHECK-NEXT: vldr d16, [r1]
; CHECK: mov pc, lr		; CHECK-NEXT: vldr d18, [r0]
		; CHECK-NEXT: vtrn.16 d18, d16
		; CHECK-NEXT: vorr d17, d16, d16
		; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
		; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A		%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B		%tmp2 = load <4 x i16>, <4 x i16>* %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>		%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
store <8 x i16> %0, <8 x i16>* %C		store <8 x i16> %0, <8 x i16>* %C
ret void		ret void
}		}

; Negative test that should not generate a vtrn		; The shuffle mask is half a vtrn; we duplicate the half to produce the
		; full result.
define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {		define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
entry:		entry:
; CHECK-LABEL: upper_twice_no_vtrn		; CHECK-LABEL: upper_twice_no_vtrn
; CHECK: @ BB#0:		; CHECK: @ BB#0:
; CHECK-NOT: vtrn		; CHECK-NEXT: vldr d16, [r1]
; CHECK: mov pc, lr		; CHECK-NEXT: vldr d18, [r0]
		; CHECK-NEXT: vtrn.16 d18, d16
		; CHECK-NEXT: vorr d19, d18, d18
		; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
		; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A		%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B		%tmp2 = load <4 x i16>, <4 x i16>* %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>		%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>
store <8 x i16> %0, <8 x i16>* %C		store <8 x i16> %0, <8 x i16>* %C
ret void		ret void
}		}