This is an archive of the discontinued LLVM Phabricator instance.

[x86] use more broadcasts to load a scalar into vector reg
ClosedPublic

Authored by spatel on Aug 22 2018, 2:03 PM.

Download Raw Diff

Details

Reviewers

craig.topper
efriedma
RKSimon

Commits

rG8a84c747d2de: [x86] try harder to use broadcast to load a scalar into vector reg
rL340685: [x86] try harder to use broadcast to load a scalar into vector reg

Summary

This is a preliminary step for a preliminary step for D50992. I noticed that x86 often misses chances to load a scalar directly into a vector register.

So this patch is just allowing more of those cases to match a broadcast op in lowerBuildVectorAsBroadcast(). The old code comment said it doesn't make sense to use a broadcast when we're loading a single element and everything else is undef, but I think that's the best case in the improved tests in insert-loaded-scalar.ll. We avoid a scalar-to-vector-register move and/or less efficient shuffling.

Note that there are some existing types that were already producing a broadcast, but that happens semi-accidentally. I.e., it's not happening as part of lowerBuildVectorAsBroadcast(). The build vector gets expanded into load + shuffle, and then shuffle lowering produces the broadcast.

Description of the other test diffs:

avx-basic.ll - replacing load+shuffle is a win.
sse3-avx-addsub-2.ll - vmovddup vs. vbroadcastss is neutral?
sse41.ll - don't care? we convert that intrinsic to generic IR now, so this test is deprecated?
vector-shuffle-128-v8.ll / vector-shuffle-256-v16.ll - do we consider the pshufb alternatives with an extra instruction a regression or a win?

Diff Detail

Repository: rL LLVM

Event Timeline

spatel created this revision.Aug 22 2018, 2:03 PM

Herald added a subscriber: mcrosier. · View Herald TranscriptAug 22 2018, 2:03 PM

spatel mentioned this in rL340565: [x86] move/add tests for insertelement with variable index; NFC.Aug 23 2018, 11:39 AM

spatel mentioned this in D51186: [x86] turn insertelement into undef with variable index into splat.Aug 23 2018, 2:22 PM

LGTM

This revision is now accepted and ready to land.Aug 24 2018, 10:37 AM

spatel mentioned this in D50992: [InstCombine] try to fold insertelt + vector op into scalar op + insertelt.Aug 24 2018, 11:42 AM

Closed by commit rL340685: [x86] try harder to use broadcast to load a scalar into vector reg (authored by spatel). · Explain WhyAug 25 2018, 7:57 AM

This revision was automatically updated to reflect the committed changes.

spatel mentioned this in rL340705: [SelectionDAG][x86] turn insertelement into undef with variable index into splat.Aug 26 2018, 11:21 AM

spatel mentioned this in D51553: [DAGCombiner][x86] add transform/hook to load a scalar directly for use in a vector binop.Aug 31 2018, 12:38 PM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

18 lines

test/

CodeGen/

X86/

avx-basic.ll

3 lines

insert-loaded-scalar.ll

163 lines

sse3-avx-addsub-2.ll

22 lines

sse41.ll

10 lines

vector-shuffle-128-v8.ll

5 lines

vector-shuffle-256-v16.ll

5 lines

Diff 162549

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,116 Lines • ▼ Show 20 Lines	if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) \|\|
SDValue Brdcst =		SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,		DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);		MVT::getVectorVT(EltType, NumElts), BOperand);
return DAG.getBitcast(VT, Brdcst);		return DAG.getBitcast(VT, Brdcst);
}		}
}		}
}		}

// We need a splat of a single value to use broadcast, and it doesn't		unsigned NumElts = VT.getVectorNumElements();
// make any sense if the value is only in one element of the vector.		unsigned NumUndefElts = UndefElements.count();
if (!Ld \|\| (VT.getVectorNumElements() - UndefElements.count()) <= 1) {		if (!Ld \|\| (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;		APInt SplatValue, Undef;
unsigned SplatBitSize;		unsigned SplatBitSize;
bool HasUndef;		bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.		// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&		if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&		SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {		SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle		// Avoid replacing with broadcast when it's a use of a shuffle
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,		MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),		MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);		Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);		SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);		return DAG.getBitcast(VT, Brdcst);
}		}
}		}
}		}

		// If we are moving a scalar into a vector (Ld must be set and all elements
		// but 1 are undef) and that operation is not obviously supported by
		// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
		// That's better than general shuffling and may eliminate a load to GPR and
		// move from scalar to vector register.
		if (!Ld \|\| NumElts - NumUndefElts != 1)
		return SDValue();
		unsigned ScalarSize = Ld.getValueSizeInBits();
		if (!(UndefElements[0] \|\| (ScalarSize != 32 && ScalarSize != 64)))
return SDValue();		return SDValue();
}		}

bool ConstSplatVal =		bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant \|\| Ld.getOpcode() == ISD::ConstantFP);		(Ld.getOpcode() == ISD::Constant \|\| Ld.getOpcode() == ISD::ConstantFP);

// Make sure that all of the users of a non-constant load are from the		// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.		// BUILD_VECTOR node.
if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))		if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
▲ Show 20 Lines • Show All 34,009 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx-basic.ll

Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>		%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
ret <4 x i64> %shuffle		ret <4 x i64> %shuffle
}		}

;;; Don't crash on movd		;;; Don't crash on movd
define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {		define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
; CHECK-LABEL: VMOVZQI2PQI:		; CHECK-LABEL: VMOVZQI2PQI:
; CHECK: ## %bb.0:		; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero		; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*		%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4		%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1		%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1
%ptrcast.i23.i = bitcast float* %ptroffset.i22.i992 to i32*		%ptrcast.i23.i = bitcast float* %ptroffset.i22.i992 to i32*
%val.i24.i = load i32, i32* %ptrcast.i23.i, align 4		%val.i24.i = load i32, i32* %ptrcast.i23.i, align 4
%updatedret.i30.i = insertelement <8 x i32> undef, i32 %val.i34.i, i32 1		%updatedret.i30.i = insertelement <8 x i32> undef, i32 %val.i34.i, i32 1
ret <8 x i32> %updatedret.i30.i		ret <8 x i32> %updatedret.i30.i
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/insert-loaded-scalar.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 \| FileCheck %s --check-prefixes=ALL,SSE			; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 \| FileCheck %s --check-prefixes=ALL,SSE
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx \| FileCheck %s --check-prefixes=ALL,AVX,AVX1			; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx \| FileCheck %s --check-prefixes=ALL,AVX,AVX1
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 \| FileCheck %s --check-prefixes=ALL,AVX,AVX2			; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 \| FileCheck %s --check-prefixes=ALL,AVX,AVX2

	define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {			define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {
	; SSE-LABEL: load8_ins_elt0_v16i8:			; SSE-LABEL: load8_ins_elt0_v16i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movzbl (%rdi), %eax			; SSE-NEXT: movzbl (%rdi), %eax
	; SSE-NEXT: movd %eax, %xmm0			; SSE-NEXT: movd %eax, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load8_ins_elt0_v16i8:			; AVX1-LABEL: load8_ins_elt0_v16i8:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: movzbl (%rdi), %eax			; AVX1-NEXT: movzbl (%rdi), %eax
	; AVX-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX-NEXT: retq			; AVX1-NEXT: retq
				;
				; AVX2-LABEL: load8_ins_elt0_v16i8:
				; AVX2: # %bb.0:
				; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
				; AVX2-NEXT: retq
	%x = load i8, i8* %p			%x = load i8, i8* %p
	%ins = insertelement <16 x i8> undef, i8 %x, i32 0			%ins = insertelement <16 x i8> undef, i8 %x, i32 0
	ret <16 x i8> %ins			ret <16 x i8> %ins
	}			}

	define <8 x i16> @load16_ins_elt0_v8i16(i16* %p) nounwind {			define <8 x i16> @load16_ins_elt0_v8i16(i16* %p) nounwind {
	; SSE-LABEL: load16_ins_elt0_v8i16:			; SSE-LABEL: load16_ins_elt0_v8i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movzwl (%rdi), %eax			; SSE-NEXT: movzwl (%rdi), %eax
	; SSE-NEXT: movd %eax, %xmm0			; SSE-NEXT: movd %eax, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load16_ins_elt0_v8i16:			; AVX1-LABEL: load16_ins_elt0_v8i16:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: movzwl (%rdi), %eax			; AVX1-NEXT: movzwl (%rdi), %eax
	; AVX-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX-NEXT: retq			; AVX1-NEXT: retq
				;
				; AVX2-LABEL: load16_ins_elt0_v8i16:
				; AVX2: # %bb.0:
				; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
				; AVX2-NEXT: retq
	%x = load i16, i16* %p			%x = load i16, i16* %p
	%ins = insertelement <8 x i16> undef, i16 %x, i32 0			%ins = insertelement <8 x i16> undef, i16 %x, i32 0
	ret <8 x i16> %ins			ret <8 x i16> %ins
	}			}

	define <4 x i32> @load32_ins_elt0_v4i32(i32* %p) nounwind {			define <4 x i32> @load32_ins_elt0_v4i32(i32* %p) nounwind {
	; SSE-LABEL: load32_ins_elt0_v4i32:			; SSE-LABEL: load32_ins_elt0_v4i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines
	define <16 x i8> @load8_ins_eltc_v16i8(i8* %p) nounwind {			define <16 x i8> @load8_ins_eltc_v16i8(i8* %p) nounwind {
	; SSE-LABEL: load8_ins_eltc_v16i8:			; SSE-LABEL: load8_ins_eltc_v16i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movzbl (%rdi), %eax			; SSE-NEXT: movzbl (%rdi), %eax
	; SSE-NEXT: movd %eax, %xmm0			; SSE-NEXT: movd %eax, %xmm0
	; SSE-NEXT: pslld $24, %xmm0			; SSE-NEXT: pslld $24, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load8_ins_eltc_v16i8:			; AVX1-LABEL: load8_ins_eltc_v16i8:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: movzbl (%rdi), %eax			; AVX1-NEXT: movzbl (%rdi), %eax
	; AVX-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX-NEXT: vpslld $24, %xmm0, %xmm0			; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX1-NEXT: retq
				;
				; AVX2-LABEL: load8_ins_eltc_v16i8:
				; AVX2: # %bb.0:
				; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
				; AVX2-NEXT: retq
	%x = load i8, i8* %p			%x = load i8, i8* %p
	%ins = insertelement <16 x i8> undef, i8 %x, i32 3			%ins = insertelement <16 x i8> undef, i8 %x, i32 3
	ret <16 x i8> %ins			ret <16 x i8> %ins
	}			}

	define <8 x i16> @load16_ins_eltc_v8i16(i16* %p) nounwind {			define <8 x i16> @load16_ins_eltc_v8i16(i16* %p) nounwind {
	; SSE-LABEL: load16_ins_eltc_v8i16:			; SSE-LABEL: load16_ins_eltc_v8i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	Show All 20 Lines

	define <4 x i32> @load32_ins_eltc_v4i32(i32* %p) nounwind {			define <4 x i32> @load32_ins_eltc_v4i32(i32* %p) nounwind {
	; SSE-LABEL: load32_ins_eltc_v4i32:			; SSE-LABEL: load32_ins_eltc_v4i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]			; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: load32_ins_eltc_v4i32:			; AVX-LABEL: load32_ins_eltc_v4i32:
	; AVX1: # %bb.0:			; AVX: # %bb.0:
	; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; AVX-NEXT: vbroadcastss (%rdi), %xmm0
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]			; AVX-NEXT: retq
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: load32_ins_eltc_v4i32:
	; AVX2: # %bb.0:
	; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
	; AVX2-NEXT: retq
	%x = load i32, i32* %p			%x = load i32, i32* %p
	%ins = insertelement <4 x i32> undef, i32 %x, i32 2			%ins = insertelement <4 x i32> undef, i32 %x, i32 2
	ret <4 x i32> %ins			ret <4 x i32> %ins
	}			}

	define <2 x i64> @load64_ins_eltc_v2i64(i64* %p) nounwind {			define <2 x i64> @load64_ins_eltc_v2i64(i64* %p) nounwind {
	; SSE-LABEL: load64_ins_eltc_v2i64:			; SSE-LABEL: load64_ins_eltc_v2i64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines

	define <32 x i8> @load8_ins_elt0_v32i8(i8* %p) nounwind {			define <32 x i8> @load8_ins_elt0_v32i8(i8* %p) nounwind {
	; SSE-LABEL: load8_ins_elt0_v32i8:			; SSE-LABEL: load8_ins_elt0_v32i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movzbl (%rdi), %eax			; SSE-NEXT: movzbl (%rdi), %eax
	; SSE-NEXT: movd %eax, %xmm0			; SSE-NEXT: movd %eax, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load8_ins_elt0_v32i8:			; AVX1-LABEL: load8_ins_elt0_v32i8:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: movzbl (%rdi), %eax			; AVX1-NEXT: movzbl (%rdi), %eax
	; AVX-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX-NEXT: retq			; AVX1-NEXT: retq
				;
				; AVX2-LABEL: load8_ins_elt0_v32i8:
				; AVX2: # %bb.0:
				; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
				; AVX2-NEXT: retq
	%x = load i8, i8* %p			%x = load i8, i8* %p
	%ins = insertelement <32 x i8> undef, i8 %x, i32 0			%ins = insertelement <32 x i8> undef, i8 %x, i32 0
	ret <32 x i8> %ins			ret <32 x i8> %ins
	}			}

	define <16 x i16> @load16_ins_elt0_v16i16(i16* %p) nounwind {			define <16 x i16> @load16_ins_elt0_v16i16(i16* %p) nounwind {
	; SSE-LABEL: load16_ins_elt0_v16i16:			; SSE-LABEL: load16_ins_elt0_v16i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movzwl (%rdi), %eax			; SSE-NEXT: movzwl (%rdi), %eax
	; SSE-NEXT: movd %eax, %xmm0			; SSE-NEXT: movd %eax, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load16_ins_elt0_v16i16:			; AVX1-LABEL: load16_ins_elt0_v16i16:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: movzwl (%rdi), %eax			; AVX1-NEXT: movzwl (%rdi), %eax
	; AVX-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX-NEXT: retq			; AVX1-NEXT: retq
				;
				; AVX2-LABEL: load16_ins_elt0_v16i16:
				; AVX2: # %bb.0:
				; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
				; AVX2-NEXT: retq
	%x = load i16, i16* %p			%x = load i16, i16* %p
	%ins = insertelement <16 x i16> undef, i16 %x, i32 0			%ins = insertelement <16 x i16> undef, i16 %x, i32 0
	ret <16 x i16> %ins			ret <16 x i16> %ins
	}			}

	define <8 x i32> @load32_ins_elt0_v8i32(i32* %p) nounwind {			define <8 x i32> @load32_ins_elt0_v8i32(i32* %p) nounwind {
	; SSE-LABEL: load32_ins_elt0_v8i32:			; SSE-LABEL: load32_ins_elt0_v8i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: movzbl (%rdi), %eax			; AVX1-NEXT: movzbl (%rdi), %eax
	; AVX1-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX1-NEXT: vpsllq $40, %xmm0, %xmm0			; AVX1-NEXT: vpsllq $40, %xmm0, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: load8_ins_eltc_v32i8:			; AVX2-LABEL: load8_ins_eltc_v32i8:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: movzbl (%rdi), %eax			; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
	; AVX2-NEXT: vmovd %eax, %xmm0
	; AVX2-NEXT: vpsllq $40, %xmm0, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%x = load i8, i8* %p			%x = load i8, i8* %p
	%ins = insertelement <32 x i8> undef, i8 %x, i32 21			%ins = insertelement <32 x i8> undef, i8 %x, i32 21
	ret <32 x i8> %ins			ret <32 x i8> %ins
	}			}

	define <16 x i16> @load16_ins_eltc_v16i16(i16* %p) nounwind {			define <16 x i16> @load16_ins_eltc_v16i16(i16* %p) nounwind {
	; SSE-LABEL: load16_ins_eltc_v16i16:			; SSE-LABEL: load16_ins_eltc_v16i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movzwl (%rdi), %eax			; SSE-NEXT: movzwl (%rdi), %eax
	; SSE-NEXT: movd %eax, %xmm1			; SSE-NEXT: movd %eax, %xmm1
	; SSE-NEXT: psllq $48, %xmm1			; SSE-NEXT: psllq $48, %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: load16_ins_eltc_v16i16:			; AVX1-LABEL: load16_ins_eltc_v16i16:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: movzwl (%rdi), %eax			; AVX1-NEXT: movzwl (%rdi), %eax
	; AVX1-NEXT: vmovd %eax, %xmm0			; AVX1-NEXT: vmovd %eax, %xmm0
	; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0			; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: load16_ins_eltc_v16i16:			; AVX2-LABEL: load16_ins_eltc_v16i16:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: movzwl (%rdi), %eax			; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
	; AVX2-NEXT: vmovd %eax, %xmm0
	; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%x = load i16, i16* %p			%x = load i16, i16* %p
	%ins = insertelement <16 x i16> undef, i16 %x, i32 11			%ins = insertelement <16 x i16> undef, i16 %x, i32 11
	ret <16 x i16> %ins			ret <16 x i16> %ins
	}			}

	define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {			define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {
	; SSE-LABEL: load32_ins_eltc_v8i32:			; SSE-LABEL: load32_ins_eltc_v8i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]			; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: load32_ins_eltc_v8i32:			; AVX-LABEL: load32_ins_eltc_v8i32:
	; AVX1: # %bb.0:			; AVX: # %bb.0:
	; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; AVX-NEXT: vbroadcastss (%rdi), %ymm0
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]			; AVX-NEXT: retq
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: load32_ins_eltc_v8i32:
	; AVX2: # %bb.0:
	; AVX2-NEXT: vbroadcastss (%rdi), %xmm0
	; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq
	%x = load i32, i32* %p			%x = load i32, i32* %p
	%ins = insertelement <8 x i32> undef, i32 %x, i32 7			%ins = insertelement <8 x i32> undef, i32 %x, i32 7
	ret <8 x i32> %ins			ret <8 x i32> %ins
	}			}

	define <4 x i64> @load64_ins_eltc_v4i64(i64* %p) nounwind {			define <4 x i64> @load64_ins_eltc_v4i64(i64* %p) nounwind {
	; SSE-LABEL: load64_ins_eltc_v4i64:			; SSE-LABEL: load64_ins_eltc_v4i64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero			; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
	; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]			; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: load64_ins_eltc_v4i64:			; AVX-LABEL: load64_ins_eltc_v4i64:
	; AVX1: # %bb.0:			; AVX: # %bb.0:
	; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero			; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]			; AVX-NEXT: retq
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: load64_ins_eltc_v4i64:
	; AVX2: # %bb.0:
	; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
	; AVX2-NEXT: retq
	%x = load i64, i64* %p			%x = load i64, i64* %p
	%ins = insertelement <4 x i64> undef, i64 %x, i32 3			%ins = insertelement <4 x i64> undef, i64 %x, i32 3
	ret <4 x i64> %ins			ret <4 x i64> %ins
	}			}

	define <8 x float> @load32_ins_eltc_v8f32(float* %p) nounwind {			define <8 x float> @load32_ins_eltc_v8f32(float* %p) nounwind {
	; SSE-LABEL: load32_ins_eltc_v8f32:			; SSE-LABEL: load32_ins_eltc_v8f32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]			; SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: load32_ins_eltc_v8f32:			; AVX-LABEL: load32_ins_eltc_v8f32:
	; AVX1: # %bb.0:			; AVX: # %bb.0:
	; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; AVX-NEXT: vbroadcastss (%rdi), %ymm0
	; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]			; AVX-NEXT: retq
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: load32_ins_eltc_v8f32:
	; AVX2: # %bb.0:
	; AVX2-NEXT: vbroadcastss (%rdi), %xmm0
	; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq
	%x = load float, float* %p			%x = load float, float* %p
	%ins = insertelement <8 x float> undef, float %x, i32 5			%ins = insertelement <8 x float> undef, float %x, i32 5
	ret <8 x float> %ins			ret <8 x float> %ins
	}			}

	define <4 x double> @load64_ins_eltc_v4f64(double* %p) nounwind {			define <4 x double> @load64_ins_eltc_v4f64(double* %p) nounwind {
	; SSE-LABEL: load64_ins_eltc_v4f64:			; SSE-LABEL: load64_ins_eltc_v4f64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movddup {{.*#+}} xmm1 = mem[0,0]			; SSE-NEXT: movddup {{.*#+}} xmm1 = mem[0,0]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: load64_ins_eltc_v4f64:			; AVX-LABEL: load64_ins_eltc_v4f64:
	; AVX1: # %bb.0:			; AVX: # %bb.0:
	; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]			; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX-NEXT: retq
	; AVX1-NEXT: retq
	;
	; AVX2-LABEL: load64_ins_eltc_v4f64:
	; AVX2: # %bb.0:
	; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
	; AVX2-NEXT: retq
	%x = load double, double* %p			%x = load double, double* %p
	%ins = insertelement <4 x double> undef, double %x, i32 3			%ins = insertelement <4 x double> undef, double %x, i32 3
	ret <4 x double> %ins			ret <4 x double> %ins
	}			}

llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll

	Show First 20 Lines • Show All 268 Lines • ▼ Show 20 Lines
	; SSE-LABEL: test11:			; SSE-LABEL: test11:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]			; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
	; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]			; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
	; SSE-NEXT: subss %xmm1, %xmm0			; SSE-NEXT: subss %xmm1, %xmm0
	; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]			; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: test11:			; AVX1-LABEL: test11:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]			; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
	; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]			; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
	; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0			; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]			; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
	; AVX-NEXT: retq			; AVX1-NEXT: retq
				;
				; AVX512-LABEL: test11:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
				; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
				; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
				; AVX512-NEXT: retq
	%1 = extractelement <4 x float> %A, i32 2			%1 = extractelement <4 x float> %A, i32 2
	%2 = extractelement <4 x float> %B, i32 2			%2 = extractelement <4 x float> %B, i32 2
	%sub = fsub float %1, %2			%sub = fsub float %1, %2
	%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2			%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
	ret <4 x float> %vecinsert1			ret <4 x float> %vecinsert1
	}			}

	define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {			define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
	▲ Show 20 Lines • Show All 402 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/sse41.ll

	Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Lines
	; X86-AVX1-NEXT: vpmovzxbq (%eax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]			; X86-AVX1-NEXT: vpmovzxbq (%eax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
	; X86-AVX1-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X86-AVX1-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X86-AVX1-NEXT: retl ## encoding: [0xc3]			; X86-AVX1-NEXT: retl ## encoding: [0xc3]
	;			;
	; X86-AVX512-LABEL: pmovzxbq_1:			; X86-AVX512-LABEL: pmovzxbq_1:
	; X86-AVX512: ## %bb.0: ## %entry			; X86-AVX512: ## %bb.0: ## %entry
	; X86-AVX512-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]			; X86-AVX512-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
	; X86-AVX512-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4			; X86-AVX512-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
	; X86-AVX512-NEXT: vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]			; X86-AVX512-NEXT: vpbroadcastw (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
	; X86-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X86-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
				; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
	; X86-AVX512-NEXT: retl ## encoding: [0xc3]			; X86-AVX512-NEXT: retl ## encoding: [0xc3]
	;			;
	; X64-SSE-LABEL: pmovzxbq_1:			; X64-SSE-LABEL: pmovzxbq_1:
	; X64-SSE: ## %bb.0: ## %entry			; X64-SSE: ## %bb.0: ## %entry
	; X64-SSE-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]			; X64-SSE-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
	; X64-SSE-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load			; X64-SSE-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	; X64-SSE-NEXT: pmovzxbq (%rax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]			; X64-SSE-NEXT: pmovzxbq (%rax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
	; X64-SSE-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-SSE-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X64-SSE-NEXT: retq ## encoding: [0xc3]			; X64-SSE-NEXT: retq ## encoding: [0xc3]
	;			;
	; X64-AVX1-LABEL: pmovzxbq_1:			; X64-AVX1-LABEL: pmovzxbq_1:
	; X64-AVX1: ## %bb.0: ## %entry			; X64-AVX1: ## %bb.0: ## %entry
	; X64-AVX1-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]			; X64-AVX1-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
	; X64-AVX1-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load			; X64-AVX1-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	; X64-AVX1-NEXT: vpmovzxbq (%rax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]			; X64-AVX1-NEXT: vpmovzxbq (%rax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
	; X64-AVX1-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-AVX1-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
	; X64-AVX1-NEXT: retq ## encoding: [0xc3]			; X64-AVX1-NEXT: retq ## encoding: [0xc3]
	;			;
	; X64-AVX512-LABEL: pmovzxbq_1:			; X64-AVX512-LABEL: pmovzxbq_1:
	; X64-AVX512: ## %bb.0: ## %entry			; X64-AVX512: ## %bb.0: ## %entry
	; X64-AVX512-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]			; X64-AVX512-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
	; X64-AVX512-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load			; X64-AVX512-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
	; X64-AVX512-NEXT: vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]			; X64-AVX512-NEXT: vpbroadcastw (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
	; X64-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero			; X64-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
				; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
	; X64-AVX512-NEXT: retq ## encoding: [0xc3]			; X64-AVX512-NEXT: retq ## encoding: [0xc3]
	entry:			entry:
	%0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]			%0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
	%1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]			%1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
	%2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]			%2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]			%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
	ret <2 x i64> %3			ret <2 x i64> %3
	}			}
	▲ Show 20 Lines • Show All 2,047 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll

	Show First 20 Lines • Show All 2,613 Lines • ▼ Show 20 Lines
	; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32:			; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]			; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:			; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero			; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]			; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
				; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:			; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
	; AVX2OR512VL: # %bb.0:			; AVX2OR512VL: # %bb.0:
	; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0			; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
	; AVX2OR512VL-NEXT: retq			; AVX2OR512VL-NEXT: retq
	%tmp = load i32, i32* %ptr, align 4			%tmp = load i32, i32* %ptr, align 4
	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1			%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
	▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll

Show First 20 Lines • Show All 4,591 Lines • ▼ Show 20 Lines	; AVX2OR512VL-NEXT: retq
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>		%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>		%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i16> %tmp3		ret <16 x i16> %tmp3
}		}

define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {		define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:		; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero		; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]		; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
		; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32:		; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX2OR512VL: # %bb.0:		; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0		; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq		; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4		%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1		%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>		%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>		%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %tmp3		ret <16 x i16> %tmp3
}		}