This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX512] Support variable-index vector insertion on AVX512 targets
Closed · Public

Authored by RKSimon on Feb 1 2021, 4:32 AM.

Download Raw Diff

Details

Reviewers

craig.topper
pengfei
spatel

Commits

rGd46a6b3d55e6: [X86][AVX512] Support variable-index vector insertion on AVX512 targets…

Summary

With predicate masks, AVX512 can efficiently perform variable-index vector insertion with 2 broadcasts + 1 comparison, avoiding a lot of aliased memory traffic.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

RKSimon created this revision. · Feb 1 2021, 4:32 AM

Herald added subscribers: arphaman, hiraditya. · View Herald Transcript · Feb 1 2021, 4:32 AM

RKSimon requested review of this revision. · Feb 1 2021, 4:32 AM

Herald added a project: Restricted Project. · View Herald Transcript · Feb 1 2021, 4:32 AM

Harbormaster completed remote builds in B87351: Diff 320437. · Feb 1 2021, 5:12 AM

spatel added inline comments. · Feb 1 2021, 5:13 AM

llvm/test/CodeGen/X86/insertelement-var-index.ll
3–5	Worth adding a shared prefix for "AVX1or2", so we don't get so much duplication?

Add AVX1OR2 check prefix

Harbormaster completed remote builds in B87371: Diff 320474. · Feb 1 2021, 9:35 AM

LGTM - see inline for a couple of minors.

llvm/lib/Target/X86/X86ISelLowering.cpp
18829–18832	Could use DAG.getSplatBuildVector() for both of these for slightly less code.
18839	We should have a code comment to describe the pattern: // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0

This revision is now accepted and ready to land. · Feb 1 2021, 1:37 PM

This revision was landed with ongoing or failed builds. · Feb 2 2021, 3:46 AM

Closed by commit rGd46a6b3d55e6: [X86][AVX512] Support variable-index vector insertion on AVX512 targets… (authored by RKSimon). · Explain Why

This revision was automatically updated to reflect the committed changes.

RKSimon added a commit: rGd46a6b3d55e6: [X86][AVX512] Support variable-index vector insertion on AVX512 targets….

RKSimon mentioned this in D95866: [X86][SSE] Support variable-index float/double vector insertion on SSE41+ targets (PR47924). · Feb 2 2021, 6:12 AM

RKSimon mentioned this in rG32b7c2fa42a2: [X86][SSE] Support variable-index float/double vector insertion on SSE41+…. · Feb 3 2021, 6:15 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

28 lines

test/

CodeGen/

X86/

avx512-insert-extract.ll

38 lines

insertelement-var-index.ll

832 lines

Diff 320736

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 18,812 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,

if (EltVT == MVT::i1)		if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);		return InsertBitToMaskVector(Op, DAG, Subtarget);

SDLoc dl(Op);		SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);		SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);		SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);		SDValue N2 = Op.getOperand(2);

auto *N2C = dyn_cast<ConstantSDNode>(N2);		auto *N2C = dyn_cast<ConstantSDNode>(N2);
if (!N2C || N2C->getAPIntValue().uge(NumElts))
		if (!N2C) {
		// Variable insertion indices, usually we're better off spilling to stack,
		// but AVX512 can use a variable compare+select by comparing against all
		// possible vector indices.
		if (!(Subtarget.hasBWI() ||
		(Subtarget.hasAVX512() && EltVT.getScalarSizeInBits() >= 32)))
		return SDValue();

		MVT IdxSVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
		MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
		spatel (inline comment): Could use DAG.getSplatBuildVector() for both of these for slightly less code.
		SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
		SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
		SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);

		SmallVector<SDValue, 16> RawIndices;
		for (unsigned I = 0; I != NumElts; ++I)
		RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
		spatel (inline comment): We should have a code comment to describe the pattern: // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0
		SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);

		// inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
		return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
		ISD::CondCode::SETEQ);
		}

		if (N2C->getAPIntValue().uge(NumElts))
return SDValue();		return SDValue();
uint64_t IdxVal = N2C->getZExtValue();		uint64_t IdxVal = N2C->getZExtValue();

bool IsZeroElt = X86::isZeroNode(N1);		bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);		bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

// If we are inserting a element, see if we can do this more efficiently with		// If we are inserting a element, see if we can do this more efficiently with
// a blend shuffle with a rematerializable vector than a costly integer		// a blend shuffle with a rematerializable vector than a costly integer
▲ Show 20 Lines • Show All 32,925 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/avx512-insert-extract.ll

	Show First 20 Lines • Show All 1,647 Lines • ▼ Show 20 Lines
	; KNL-NEXT: orl %ecx, %eax			; KNL-NEXT: orl %ecx, %eax
	; KNL-NEXT: movq %rbp, %rsp			; KNL-NEXT: movq %rbp, %rsp
	; KNL-NEXT: popq %rbp			; KNL-NEXT: popq %rbp
	; KNL-NEXT: vzeroupper			; KNL-NEXT: vzeroupper
	; KNL-NEXT: retq			; KNL-NEXT: retq
	;			;
	; SKX-LABEL: test_insertelement_variable_v32i1:			; SKX-LABEL: test_insertelement_variable_v32i1:
	; SKX: ## %bb.0:			; SKX: ## %bb.0:
	; SKX-NEXT: pushq %rbp
	; SKX-NEXT: .cfi_def_cfa_offset 16
	; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-32, %rsp
	; SKX-NEXT: subq $64, %rsp
	; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
	; SKX-NEXT: vptestmb %ymm0, %ymm0, %k0			; SKX-NEXT: vptestmb %ymm0, %ymm0, %k0
	; SKX-NEXT: andl $31, %esi
	; SKX-NEXT: testb %dil, %dil			; SKX-NEXT: testb %dil, %dil
				; SKX-NEXT: setne %al
				; SKX-NEXT: vpbroadcastb %esi, %ymm0
				; SKX-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %k1
	; SKX-NEXT: vpmovm2b %k0, %ymm0			; SKX-NEXT: vpmovm2b %k0, %ymm0
	; SKX-NEXT: vmovdqa %ymm0, (%rsp)			; SKX-NEXT: vpbroadcastb %eax, %ymm0 {%k1}
	; SKX-NEXT: setne (%rsp,%rsi)			; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
	; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
	; SKX-NEXT: vpmovb2m %ymm0, %k0			; SKX-NEXT: vpmovb2m %ymm0, %k0
	; SKX-NEXT: kmovd %k0, %eax			; SKX-NEXT: kmovd %k0, %eax
	; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t1 = icmp ugt <32 x i8> %a, zeroinitializer			%t1 = icmp ugt <32 x i8> %a, zeroinitializer
	%t2 = icmp ugt i8 %b, 0			%t2 = icmp ugt i8 %b, 0
	%t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index			%t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
	%t4 = bitcast <32 x i1> %t3 to i32			%t4 = bitcast <32 x i1> %t3 to i32
	ret i32 %t4			ret i32 %t4
	}			}
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; KNL-NEXT: orq %rcx, %rax			; KNL-NEXT: orq %rcx, %rax
	; KNL-NEXT: movq %rbp, %rsp			; KNL-NEXT: movq %rbp, %rsp
	; KNL-NEXT: popq %rbp			; KNL-NEXT: popq %rbp
	; KNL-NEXT: vzeroupper			; KNL-NEXT: vzeroupper
	; KNL-NEXT: retq			; KNL-NEXT: retq
	;			;
	; SKX-LABEL: test_insertelement_variable_v64i1:			; SKX-LABEL: test_insertelement_variable_v64i1:
	; SKX: ## %bb.0:			; SKX: ## %bb.0:
	; SKX-NEXT: pushq %rbp
	; SKX-NEXT: .cfi_def_cfa_offset 16
	; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-64, %rsp
	; SKX-NEXT: subq $128, %rsp
	; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
	; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0			; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
	; SKX-NEXT: andl $63, %esi
	; SKX-NEXT: testb %dil, %dil			; SKX-NEXT: testb %dil, %dil
				; SKX-NEXT: setne %al
				; SKX-NEXT: vpbroadcastb %esi, %zmm0
				; SKX-NEXT: vpcmpeqb {{.*}}(%rip), %zmm0, %k1
	; SKX-NEXT: vpmovm2b %k0, %zmm0			; SKX-NEXT: vpmovm2b %k0, %zmm0
	; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)			; SKX-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
	; SKX-NEXT: setne (%rsp,%rsi)			; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
	; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
	; SKX-NEXT: vpmovb2m %zmm0, %k0			; SKX-NEXT: vpmovb2m %zmm0, %k0
	; SKX-NEXT: kmovq %k0, %rax			; SKX-NEXT: kmovq %k0, %rax
	; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t1 = icmp ugt <64 x i8> %a, zeroinitializer			%t1 = icmp ugt <64 x i8> %a, zeroinitializer
	%t2 = icmp ugt i8 %b, 0			%t2 = icmp ugt i8 %b, 0
	%t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index			%t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
	%t4 = bitcast <64 x i1> %t3 to i64			%t4 = bitcast <64 x i1> %t3 to i64
	ret i64 %t4			ret i64 %t4
	}			}
	▲ Show 20 Lines • Show All 527 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/insertelement-var-index.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2			; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41			; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1			; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX1
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2			; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2
				spatel (inline comment): Worth adding a shared prefix for "AVX1or2", so we don't get so much duplication?
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F			; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
	; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW			; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW

	define <16 x i8> @undef_index(i8 %x) nounwind {			define <16 x i8> @undef_index(i8 %x) nounwind {
	; ALL-LABEL: undef_index:			; ALL-LABEL: undef_index:
	; ALL: # %bb.0:			; ALL: # %bb.0:
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%ins = insertelement <16 x i8> undef, i8 %x, i64 undef			%ins = insertelement <16 x i8> undef, i8 %x, i64 undef
	▲ Show 20 Lines • Show All 680 Lines • ▼ Show 20 Lines
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $15, %esi			; SSE-NEXT: andl $15, %esi
	; SSE-NEXT: movb %dil, -24(%rsp,%rsi)			; SSE-NEXT: movb %dil, -24(%rsp,%rsi)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i8_v16i8:			; AVX1OR2-LABEL: arg_i8_v16i8:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $15, %esi			; AVX1OR2-NEXT: andl $15, %esi
	; AVX-NEXT: movb %dil, -24(%rsp,%rsi)			; AVX1OR2-NEXT: movb %dil, -24(%rsp,%rsi)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: arg_i8_v16i8:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
				; AVX512F-NEXT: andl $15, %esi
				; AVX512F-NEXT: movb %dil, -24(%rsp,%rsi)
				; AVX512F-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: arg_i8_v16i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastb %esi, %xmm1
				; AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %k1
				; AVX512BW-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
				; AVX512BW-NEXT: retq
	%ins = insertelement <16 x i8> %v, i8 %x, i32 %y			%ins = insertelement <16 x i8> %v, i8 %x, i32 %y
	ret <16 x i8> %ins			ret <16 x i8> %ins
	}			}

	define <8 x i16> @arg_i16_v8i16(<8 x i16> %v, i16 %x, i32 %y) nounwind {			define <8 x i16> @arg_i16_v8i16(<8 x i16> %v, i16 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i16_v8i16:			; SSE-LABEL: arg_i16_v8i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $7, %esi			; SSE-NEXT: andl $7, %esi
	; SSE-NEXT: movw %di, -24(%rsp,%rsi,2)			; SSE-NEXT: movw %di, -24(%rsp,%rsi,2)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i16_v8i16:			; AVX1OR2-LABEL: arg_i16_v8i16:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $7, %esi			; AVX1OR2-NEXT: andl $7, %esi
	; AVX-NEXT: movw %di, -24(%rsp,%rsi,2)			; AVX1OR2-NEXT: movw %di, -24(%rsp,%rsi,2)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: arg_i16_v8i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
				; AVX512F-NEXT: andl $7, %esi
				; AVX512F-NEXT: movw %di, -24(%rsp,%rsi,2)
				; AVX512F-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: arg_i16_v8i16:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastw %esi, %xmm1
				; AVX512BW-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %k1
				; AVX512BW-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
				; AVX512BW-NEXT: retq
	%ins = insertelement <8 x i16> %v, i16 %x, i32 %y			%ins = insertelement <8 x i16> %v, i16 %x, i32 %y
	ret <8 x i16> %ins			ret <8 x i16> %ins
	}			}

	define <4 x i32> @arg_i32_v4i32(<4 x i32> %v, i32 %x, i32 %y) nounwind {			define <4 x i32> @arg_i32_v4i32(<4 x i32> %v, i32 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i32_v4i32:			; SSE-LABEL: arg_i32_v4i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %esi			; SSE-NEXT: andl $3, %esi
	; SSE-NEXT: movl %edi, -24(%rsp,%rsi,4)			; SSE-NEXT: movl %edi, -24(%rsp,%rsi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i32_v4i32:			; AVX1OR2-LABEL: arg_i32_v4i32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $3, %esi			; AVX1OR2-NEXT: andl $3, %esi
	; AVX-NEXT: movl %edi, -24(%rsp,%rsi,4)			; AVX1OR2-NEXT: movl %edi, -24(%rsp,%rsi,4)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_i32_v4i32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %esi, %xmm1
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %xmm1, %k1
				; AVX512-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <4 x i32> %v, i32 %x, i32 %y			%ins = insertelement <4 x i32> %v, i32 %x, i32 %y
	ret <4 x i32> %ins			ret <4 x i32> %ins
	}			}

	define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {			define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i64_v2i64:			; SSE-LABEL: arg_i64_v2i64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $1, %esi			; SSE-NEXT: andl $1, %esi
	; SSE-NEXT: movq %rdi, -24(%rsp,%rsi,8)			; SSE-NEXT: movq %rdi, -24(%rsp,%rsi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i64_v2i64:			; AVX1OR2-LABEL: arg_i64_v2i64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $1, %esi			; AVX1OR2-NEXT: andl $1, %esi
	; AVX-NEXT: movq %rdi, -24(%rsp,%rsi,8)			; AVX1OR2-NEXT: movq %rdi, -24(%rsp,%rsi,8)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_i64_v2i64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %esi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %xmm1
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %xmm1, %k1
				; AVX512-NEXT: vpbroadcastq %rdi, %xmm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <2 x i64> %v, i64 %x, i32 %y			%ins = insertelement <2 x i64> %v, i64 %x, i32 %y
	ret <2 x i64> %ins			ret <2 x i64> %ins
	}			}

	define <4 x float> @arg_f32_v4f32(<4 x float> %v, float %x, i32 %y) nounwind {			define <4 x float> @arg_f32_v4f32(<4 x float> %v, float %x, i32 %y) nounwind {
	; SSE-LABEL: arg_f32_v4f32:			; SSE-LABEL: arg_f32_v4f32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $edi killed $edi def $rdi			; SSE-NEXT: # kill: def $edi killed $edi def $rdi
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %edi			; SSE-NEXT: andl $3, %edi
	; SSE-NEXT: movss %xmm1, -24(%rsp,%rdi,4)			; SSE-NEXT: movss %xmm1, -24(%rsp,%rdi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_f32_v4f32:			; AVX1OR2-LABEL: arg_f32_v4f32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $edi killed $edi def $rdi			; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $3, %edi			; AVX1OR2-NEXT: andl $3, %edi
	; AVX-NEXT: vmovss %xmm1, -24(%rsp,%rdi,4)			; AVX1OR2-NEXT: vmovss %xmm1, -24(%rsp,%rdi,4)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_f32_v4f32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %edi, %xmm2
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %xmm2, %k1
				; AVX512-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <4 x float> %v, float %x, i32 %y			%ins = insertelement <4 x float> %v, float %x, i32 %y
	ret <4 x float> %ins			ret <4 x float> %ins
	}			}

	define <2 x double> @arg_f64_v2f64(<2 x double> %v, double %x, i32 %y) nounwind {			define <2 x double> @arg_f64_v2f64(<2 x double> %v, double %x, i32 %y) nounwind {
	; SSE-LABEL: arg_f64_v2f64:			; SSE-LABEL: arg_f64_v2f64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $edi killed $edi def $rdi			; SSE-NEXT: # kill: def $edi killed $edi def $rdi
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $1, %edi			; SSE-NEXT: andl $1, %edi
	; SSE-NEXT: movsd %xmm1, -24(%rsp,%rdi,8)			; SSE-NEXT: movsd %xmm1, -24(%rsp,%rdi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_f64_v2f64:			; AVX1OR2-LABEL: arg_f64_v2f64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $edi killed $edi def $rdi			; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $1, %edi			; AVX1OR2-NEXT: andl $1, %edi
	; AVX-NEXT: vmovsd %xmm1, -24(%rsp,%rdi,8)			; AVX1OR2-NEXT: vmovsd %xmm1, -24(%rsp,%rdi,8)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_f64_v2f64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %edi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %xmm2
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %xmm2, %k1
				; AVX512-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
				; AVX512-NEXT: retq
	%ins = insertelement <2 x double> %v, double %x, i32 %y			%ins = insertelement <2 x double> %v, double %x, i32 %y
	ret <2 x double> %ins			ret <2 x double> %ins
	}			}

	define <16 x i8> @load_i8_v16i8(<16 x i8> %v, i8* %p, i32 %y) nounwind {			define <16 x i8> @load_i8_v16i8(<16 x i8> %v, i8* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i8_v16i8:			; SSE-LABEL: load_i8_v16i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movb (%rdi), %al			; SSE-NEXT: movb (%rdi), %al
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $15, %esi			; SSE-NEXT: andl $15, %esi
	; SSE-NEXT: movb %al, -24(%rsp,%rsi)			; SSE-NEXT: movb %al, -24(%rsp,%rsi)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i8_v16i8:			; AVX1OR2-LABEL: load_i8_v16i8:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movb (%rdi), %al			; AVX1OR2-NEXT: movb (%rdi), %al
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $15, %esi			; AVX1OR2-NEXT: andl $15, %esi
	; AVX-NEXT: movb %al, -24(%rsp,%rsi)			; AVX1OR2-NEXT: movb %al, -24(%rsp,%rsi)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: load_i8_v16i8:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: movb (%rdi), %al
				; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
				; AVX512F-NEXT: andl $15, %esi
				; AVX512F-NEXT: movb %al, -24(%rsp,%rsi)
				; AVX512F-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: load_i8_v16i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastb %esi, %xmm1
				; AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %k1
				; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
				; AVX512BW-NEXT: retq
	%x = load i8, i8* %p			%x = load i8, i8* %p
	%ins = insertelement <16 x i8> %v, i8 %x, i32 %y			%ins = insertelement <16 x i8> %v, i8 %x, i32 %y
	ret <16 x i8> %ins			ret <16 x i8> %ins
	}			}

	define <8 x i16> @load_i16_v8i16(<8 x i16> %v, i16* %p, i32 %y) nounwind {			define <8 x i16> @load_i16_v8i16(<8 x i16> %v, i16* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i16_v8i16:			; SSE-LABEL: load_i16_v8i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movzwl (%rdi), %eax			; SSE-NEXT: movzwl (%rdi), %eax
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $7, %esi			; SSE-NEXT: andl $7, %esi
	; SSE-NEXT: movw %ax, -24(%rsp,%rsi,2)			; SSE-NEXT: movw %ax, -24(%rsp,%rsi,2)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i16_v8i16:			; AVX1OR2-LABEL: load_i16_v8i16:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movzwl (%rdi), %eax			; AVX1OR2-NEXT: movzwl (%rdi), %eax
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $7, %esi			; AVX1OR2-NEXT: andl $7, %esi
	; AVX-NEXT: movw %ax, -24(%rsp,%rsi,2)			; AVX1OR2-NEXT: movw %ax, -24(%rsp,%rsi,2)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: load_i16_v8i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: movzwl (%rdi), %eax
				; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
				; AVX512F-NEXT: andl $7, %esi
				; AVX512F-NEXT: movw %ax, -24(%rsp,%rsi,2)
				; AVX512F-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: load_i16_v8i16:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastw %esi, %xmm1
				; AVX512BW-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %k1
				; AVX512BW-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
				; AVX512BW-NEXT: retq
	%x = load i16, i16* %p			%x = load i16, i16* %p
	%ins = insertelement <8 x i16> %v, i16 %x, i32 %y			%ins = insertelement <8 x i16> %v, i16 %x, i32 %y
	ret <8 x i16> %ins			ret <8 x i16> %ins
	}			}

	define <4 x i32> @load_i32_v4i32(<4 x i32> %v, i32* %p, i32 %y) nounwind {			define <4 x i32> @load_i32_v4i32(<4 x i32> %v, i32* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i32_v4i32:			; SSE-LABEL: load_i32_v4i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movl (%rdi), %eax			; SSE-NEXT: movl (%rdi), %eax
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %esi			; SSE-NEXT: andl $3, %esi
	; SSE-NEXT: movl %eax, -24(%rsp,%rsi,4)			; SSE-NEXT: movl %eax, -24(%rsp,%rsi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i32_v4i32:			; AVX1OR2-LABEL: load_i32_v4i32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movl (%rdi), %eax			; AVX1OR2-NEXT: movl (%rdi), %eax
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $3, %esi			; AVX1OR2-NEXT: andl $3, %esi
	; AVX-NEXT: movl %eax, -24(%rsp,%rsi,4)			; AVX1OR2-NEXT: movl %eax, -24(%rsp,%rsi,4)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_i32_v4i32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %esi, %xmm1
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %xmm1, %k1
				; AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
				; AVX512-NEXT: retq
	%x = load i32, i32* %p			%x = load i32, i32* %p
	%ins = insertelement <4 x i32> %v, i32 %x, i32 %y			%ins = insertelement <4 x i32> %v, i32 %x, i32 %y
	ret <4 x i32> %ins			ret <4 x i32> %ins
	}			}

	define <2 x i64> @load_i64_v2i64(<2 x i64> %v, i64* %p, i32 %y) nounwind {			define <2 x i64> @load_i64_v2i64(<2 x i64> %v, i64* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i64_v2i64:			; SSE-LABEL: load_i64_v2i64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movq (%rdi), %rax			; SSE-NEXT: movq (%rdi), %rax
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $1, %esi			; SSE-NEXT: andl $1, %esi
	; SSE-NEXT: movq %rax, -24(%rsp,%rsi,8)			; SSE-NEXT: movq %rax, -24(%rsp,%rsi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i64_v2i64:			; AVX1OR2-LABEL: load_i64_v2i64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movq (%rdi), %rax			; AVX1OR2-NEXT: movq (%rdi), %rax
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $1, %esi			; AVX1OR2-NEXT: andl $1, %esi
	; AVX-NEXT: movq %rax, -24(%rsp,%rsi,8)			; AVX1OR2-NEXT: movq %rax, -24(%rsp,%rsi,8)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_i64_v2i64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %esi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %xmm1
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %xmm1, %k1
				; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1}
				; AVX512-NEXT: retq
	%x = load i64, i64* %p			%x = load i64, i64* %p
	%ins = insertelement <2 x i64> %v, i64 %x, i32 %y			%ins = insertelement <2 x i64> %v, i64 %x, i32 %y
	ret <2 x i64> %ins			ret <2 x i64> %ins
	}			}

	define <4 x float> @load_f32_v4f32(<4 x float> %v, float* %p, i32 %y) nounwind {			define <4 x float> @load_f32_v4f32(<4 x float> %v, float* %p, i32 %y) nounwind {
	; SSE-LABEL: load_f32_v4f32:			; SSE-LABEL: load_f32_v4f32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %esi			; SSE-NEXT: andl $3, %esi
	; SSE-NEXT: movss %xmm1, -24(%rsp,%rsi,4)			; SSE-NEXT: movss %xmm1, -24(%rsp,%rsi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_f32_v4f32:			; AVX1OR2-LABEL: load_f32_v4f32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $3, %esi			; AVX1OR2-NEXT: andl $3, %esi
	; AVX-NEXT: vmovss %xmm1, -24(%rsp,%rsi,4)			; AVX1OR2-NEXT: vmovss %xmm1, -24(%rsp,%rsi,4)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_f32_v4f32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %esi, %xmm1
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %xmm1, %k1
				; AVX512-NEXT: vbroadcastss (%rdi), %xmm0 {%k1}
				; AVX512-NEXT: retq
	%x = load float, float* %p			%x = load float, float* %p
	%ins = insertelement <4 x float> %v, float %x, i32 %y			%ins = insertelement <4 x float> %v, float %x, i32 %y
	ret <4 x float> %ins			ret <4 x float> %ins
	}			}

	define <2 x double> @load_f64_v2f64(<2 x double> %v, double* %p, i32 %y) nounwind {			define <2 x double> @load_f64_v2f64(<2 x double> %v, double* %p, i32 %y) nounwind {
	; SSE-LABEL: load_f64_v2f64:			; SSE-LABEL: load_f64_v2f64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero			; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $1, %esi			; SSE-NEXT: andl $1, %esi
	; SSE-NEXT: movsd %xmm1, -24(%rsp,%rsi,8)			; SSE-NEXT: movsd %xmm1, -24(%rsp,%rsi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_f64_v2f64:			; AVX1OR2-LABEL: load_f64_v2f64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero			; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; AVX-NEXT: andl $1, %esi			; AVX1OR2-NEXT: andl $1, %esi
	; AVX-NEXT: vmovsd %xmm1, -24(%rsp,%rsi,8)			; AVX1OR2-NEXT: vmovsd %xmm1, -24(%rsp,%rsi,8)
	; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0			; AVX1OR2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_f64_v2f64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %esi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %xmm1
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %xmm1, %k1
				; AVX512-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
				; AVX512-NEXT: retq
	%x = load double, double* %p			%x = load double, double* %p
	%ins = insertelement <2 x double> %v, double %x, i32 %y			%ins = insertelement <2 x double> %v, double %x, i32 %y
	ret <2 x double> %ins			ret <2 x double> %ins
	}			}

	define <32 x i8> @arg_i8_v32i8(<32 x i8> %v, i8 %x, i32 %y) nounwind {			define <32 x i8> @arg_i8_v32i8(<32 x i8> %v, i8 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i8_v32i8:			; SSE-LABEL: arg_i8_v32i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $31, %esi			; SSE-NEXT: andl $31, %esi
	; SSE-NEXT: movb %dil, -40(%rsp,%rsi)			; SSE-NEXT: movb %dil, -40(%rsp,%rsi)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i8_v32i8:			; AVX1OR2-LABEL: arg_i8_v32i8:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $31, %esi			; AVX1OR2-NEXT: andl $31, %esi
	; AVX-NEXT: movb %dil, (%rsp,%rsi)			; AVX1OR2-NEXT: movb %dil, (%rsp,%rsi)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: arg_i8_v32i8:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: pushq %rbp
				; AVX512F-NEXT: movq %rsp, %rbp
				; AVX512F-NEXT: andq $-32, %rsp
				; AVX512F-NEXT: subq $64, %rsp
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
				; AVX512F-NEXT: andl $31, %esi
				; AVX512F-NEXT: movb %dil, (%rsp,%rsi)
				; AVX512F-NEXT: vmovaps (%rsp), %ymm0
				; AVX512F-NEXT: movq %rbp, %rsp
				; AVX512F-NEXT: popq %rbp
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: arg_i8_v32i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastb %esi, %ymm1
				; AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %k1
				; AVX512BW-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
				; AVX512BW-NEXT: retq
	%ins = insertelement <32 x i8> %v, i8 %x, i32 %y			%ins = insertelement <32 x i8> %v, i8 %x, i32 %y
	ret <32 x i8> %ins			ret <32 x i8> %ins
	}			}

	define <16 x i16> @arg_i16_v16i16(<16 x i16> %v, i16 %x, i32 %y) nounwind {			define <16 x i16> @arg_i16_v16i16(<16 x i16> %v, i16 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i16_v16i16:			; SSE-LABEL: arg_i16_v16i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $15, %esi			; SSE-NEXT: andl $15, %esi
	; SSE-NEXT: movw %di, -40(%rsp,%rsi,2)			; SSE-NEXT: movw %di, -40(%rsp,%rsi,2)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i16_v16i16:			; AVX1OR2-LABEL: arg_i16_v16i16:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $15, %esi			; AVX1OR2-NEXT: andl $15, %esi
	; AVX-NEXT: movw %di, (%rsp,%rsi,2)			; AVX1OR2-NEXT: movw %di, (%rsp,%rsi,2)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: arg_i16_v16i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: pushq %rbp
				; AVX512F-NEXT: movq %rsp, %rbp
				; AVX512F-NEXT: andq $-32, %rsp
				; AVX512F-NEXT: subq $64, %rsp
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
				; AVX512F-NEXT: andl $15, %esi
				; AVX512F-NEXT: movw %di, (%rsp,%rsi,2)
				; AVX512F-NEXT: vmovaps (%rsp), %ymm0
				; AVX512F-NEXT: movq %rbp, %rsp
				; AVX512F-NEXT: popq %rbp
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: arg_i16_v16i16:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastw %esi, %ymm1
				; AVX512BW-NEXT: vpcmpeqw {{.*}}(%rip), %ymm1, %k1
				; AVX512BW-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
				; AVX512BW-NEXT: retq
	%ins = insertelement <16 x i16> %v, i16 %x, i32 %y			%ins = insertelement <16 x i16> %v, i16 %x, i32 %y
	ret <16 x i16> %ins			ret <16 x i16> %ins
	}			}

	define <8 x i32> @arg_i32_v8i32(<8 x i32> %v, i32 %x, i32 %y) nounwind {			define <8 x i32> @arg_i32_v8i32(<8 x i32> %v, i32 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i32_v8i32:			; SSE-LABEL: arg_i32_v8i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $7, %esi			; SSE-NEXT: andl $7, %esi
	; SSE-NEXT: movl %edi, -40(%rsp,%rsi,4)			; SSE-NEXT: movl %edi, -40(%rsp,%rsi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i32_v8i32:			; AVX1OR2-LABEL: arg_i32_v8i32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $7, %esi			; AVX1OR2-NEXT: andl $7, %esi
	; AVX-NEXT: movl %edi, (%rsp,%rsi,4)			; AVX1OR2-NEXT: movl %edi, (%rsp,%rsi,4)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_i32_v8i32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %esi, %ymm1
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %ymm1, %k1
				; AVX512-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <8 x i32> %v, i32 %x, i32 %y			%ins = insertelement <8 x i32> %v, i32 %x, i32 %y
	ret <8 x i32> %ins			ret <8 x i32> %ins
	}			}

	define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind {			define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind {
	; SSE-LABEL: arg_i64_v4i64:			; SSE-LABEL: arg_i64_v4i64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %esi			; SSE-NEXT: andl $3, %esi
	; SSE-NEXT: movq %rdi, -40(%rsp,%rsi,8)			; SSE-NEXT: movq %rdi, -40(%rsp,%rsi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_i64_v4i64:			; AVX1OR2-LABEL: arg_i64_v4i64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $3, %esi			; AVX1OR2-NEXT: andl $3, %esi
	; AVX-NEXT: movq %rdi, (%rsp,%rsi,8)			; AVX1OR2-NEXT: movq %rdi, (%rsp,%rsi,8)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_i64_v4i64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %esi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %ymm1
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %ymm1, %k1
				; AVX512-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <4 x i64> %v, i64 %x, i32 %y			%ins = insertelement <4 x i64> %v, i64 %x, i32 %y
	ret <4 x i64> %ins			ret <4 x i64> %ins
	}			}

	define <8 x float> @arg_f32_v8f32(<8 x float> %v, float %x, i32 %y) nounwind {			define <8 x float> @arg_f32_v8f32(<8 x float> %v, float %x, i32 %y) nounwind {
	; SSE-LABEL: arg_f32_v8f32:			; SSE-LABEL: arg_f32_v8f32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $edi killed $edi def $rdi			; SSE-NEXT: # kill: def $edi killed $edi def $rdi
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $7, %edi			; SSE-NEXT: andl $7, %edi
	; SSE-NEXT: movss %xmm2, -40(%rsp,%rdi,4)			; SSE-NEXT: movss %xmm2, -40(%rsp,%rdi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_f32_v8f32:			; AVX1OR2-LABEL: arg_f32_v8f32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $edi killed $edi def $rdi			; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $7, %edi			; AVX1OR2-NEXT: andl $7, %edi
	; AVX-NEXT: vmovss %xmm1, (%rsp,%rdi,4)			; AVX1OR2-NEXT: vmovss %xmm1, (%rsp,%rdi,4)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_f32_v8f32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %edi, %ymm2
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %ymm2, %k1
				; AVX512-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <8 x float> %v, float %x, i32 %y			%ins = insertelement <8 x float> %v, float %x, i32 %y
	ret <8 x float> %ins			ret <8 x float> %ins
	}			}

	define <4 x double> @arg_f64_v4f64(<4 x double> %v, double %x, i32 %y) nounwind {			define <4 x double> @arg_f64_v4f64(<4 x double> %v, double %x, i32 %y) nounwind {
	; SSE-LABEL: arg_f64_v4f64:			; SSE-LABEL: arg_f64_v4f64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $edi killed $edi def $rdi			; SSE-NEXT: # kill: def $edi killed $edi def $rdi
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %edi			; SSE-NEXT: andl $3, %edi
	; SSE-NEXT: movsd %xmm2, -40(%rsp,%rdi,8)			; SSE-NEXT: movsd %xmm2, -40(%rsp,%rdi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: arg_f64_v4f64:			; AVX1OR2-LABEL: arg_f64_v4f64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $edi killed $edi def $rdi			; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $3, %edi			; AVX1OR2-NEXT: andl $3, %edi
	; AVX-NEXT: vmovsd %xmm1, (%rsp,%rdi,8)			; AVX1OR2-NEXT: vmovsd %xmm1, (%rsp,%rdi,8)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: arg_f64_v4f64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %edi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %ymm2
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %ymm2, %k1
				; AVX512-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
				; AVX512-NEXT: retq
	%ins = insertelement <4 x double> %v, double %x, i32 %y			%ins = insertelement <4 x double> %v, double %x, i32 %y
	ret <4 x double> %ins			ret <4 x double> %ins
	}			}

	define <32 x i8> @load_i8_v32i8(<32 x i8> %v, i8* %p, i32 %y) nounwind {			define <32 x i8> @load_i8_v32i8(<32 x i8> %v, i8* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i8_v32i8:			; SSE-LABEL: load_i8_v32i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movb (%rdi), %al			; SSE-NEXT: movb (%rdi), %al
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $31, %esi			; SSE-NEXT: andl $31, %esi
	; SSE-NEXT: movb %al, -40(%rsp,%rsi)			; SSE-NEXT: movb %al, -40(%rsp,%rsi)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i8_v32i8:			; AVX1OR2-LABEL: load_i8_v32i8:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movb (%rdi), %al			; AVX1OR2-NEXT: movb (%rdi), %al
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $31, %esi			; AVX1OR2-NEXT: andl $31, %esi
	; AVX-NEXT: movb %al, (%rsp,%rsi)			; AVX1OR2-NEXT: movb %al, (%rsp,%rsi)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: load_i8_v32i8:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: pushq %rbp
				; AVX512F-NEXT: movq %rsp, %rbp
				; AVX512F-NEXT: andq $-32, %rsp
				; AVX512F-NEXT: subq $64, %rsp
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: movb (%rdi), %al
				; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
				; AVX512F-NEXT: andl $31, %esi
				; AVX512F-NEXT: movb %al, (%rsp,%rsi)
				; AVX512F-NEXT: vmovaps (%rsp), %ymm0
				; AVX512F-NEXT: movq %rbp, %rsp
				; AVX512F-NEXT: popq %rbp
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: load_i8_v32i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastb %esi, %ymm1
				; AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %k1
				; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
				; AVX512BW-NEXT: retq
	%x = load i8, i8* %p			%x = load i8, i8* %p
	%ins = insertelement <32 x i8> %v, i8 %x, i32 %y			%ins = insertelement <32 x i8> %v, i8 %x, i32 %y
	ret <32 x i8> %ins			ret <32 x i8> %ins
	}			}

	define <16 x i16> @load_i16_v16i16(<16 x i16> %v, i16* %p, i32 %y) nounwind {			define <16 x i16> @load_i16_v16i16(<16 x i16> %v, i16* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i16_v16i16:			; SSE-LABEL: load_i16_v16i16:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movzwl (%rdi), %eax			; SSE-NEXT: movzwl (%rdi), %eax
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $15, %esi			; SSE-NEXT: andl $15, %esi
	; SSE-NEXT: movw %ax, -40(%rsp,%rsi,2)			; SSE-NEXT: movw %ax, -40(%rsp,%rsi,2)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i16_v16i16:			; AVX1OR2-LABEL: load_i16_v16i16:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movzwl (%rdi), %eax			; AVX1OR2-NEXT: movzwl (%rdi), %eax
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $15, %esi			; AVX1OR2-NEXT: andl $15, %esi
	; AVX-NEXT: movw %ax, (%rsp,%rsi,2)			; AVX1OR2-NEXT: movw %ax, (%rsp,%rsi,2)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512F-LABEL: load_i16_v16i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: pushq %rbp
				; AVX512F-NEXT: movq %rsp, %rbp
				; AVX512F-NEXT: andq $-32, %rsp
				; AVX512F-NEXT: subq $64, %rsp
				; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi
				; AVX512F-NEXT: movzwl (%rdi), %eax
				; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
				; AVX512F-NEXT: andl $15, %esi
				; AVX512F-NEXT: movw %ax, (%rsp,%rsi,2)
				; AVX512F-NEXT: vmovaps (%rsp), %ymm0
				; AVX512F-NEXT: movq %rbp, %rsp
				; AVX512F-NEXT: popq %rbp
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: load_i16_v16i16:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpbroadcastw %esi, %ymm1
				; AVX512BW-NEXT: vpcmpeqw {{.*}}(%rip), %ymm1, %k1
				; AVX512BW-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
				; AVX512BW-NEXT: retq
	%x = load i16, i16* %p			%x = load i16, i16* %p
	%ins = insertelement <16 x i16> %v, i16 %x, i32 %y			%ins = insertelement <16 x i16> %v, i16 %x, i32 %y
	ret <16 x i16> %ins			ret <16 x i16> %ins
	}			}

	define <8 x i32> @load_i32_v8i32(<8 x i32> %v, i32* %p, i32 %y) nounwind {			define <8 x i32> @load_i32_v8i32(<8 x i32> %v, i32* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i32_v8i32:			; SSE-LABEL: load_i32_v8i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movl (%rdi), %eax			; SSE-NEXT: movl (%rdi), %eax
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $7, %esi			; SSE-NEXT: andl $7, %esi
	; SSE-NEXT: movl %eax, -40(%rsp,%rsi,4)			; SSE-NEXT: movl %eax, -40(%rsp,%rsi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i32_v8i32:			; AVX1OR2-LABEL: load_i32_v8i32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movl (%rdi), %eax			; AVX1OR2-NEXT: movl (%rdi), %eax
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $7, %esi			; AVX1OR2-NEXT: andl $7, %esi
	; AVX-NEXT: movl %eax, (%rsp,%rsi,4)			; AVX1OR2-NEXT: movl %eax, (%rsp,%rsi,4)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_i32_v8i32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %esi, %ymm1
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %ymm1, %k1
				; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
				; AVX512-NEXT: retq
	%x = load i32, i32* %p			%x = load i32, i32* %p
	%ins = insertelement <8 x i32> %v, i32 %x, i32 %y			%ins = insertelement <8 x i32> %v, i32 %x, i32 %y
	ret <8 x i32> %ins			ret <8 x i32> %ins
	}			}

	define <4 x i64> @load_i64_v4i64(<4 x i64> %v, i64* %p, i32 %y) nounwind {			define <4 x i64> @load_i64_v4i64(<4 x i64> %v, i64* %p, i32 %y) nounwind {
	; SSE-LABEL: load_i64_v4i64:			; SSE-LABEL: load_i64_v4i64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movq (%rdi), %rax			; SSE-NEXT: movq (%rdi), %rax
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %esi			; SSE-NEXT: andl $3, %esi
	; SSE-NEXT: movq %rax, -40(%rsp,%rsi,8)			; SSE-NEXT: movq %rax, -40(%rsp,%rsi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_i64_v4i64:			; AVX1OR2-LABEL: load_i64_v4i64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: movq (%rdi), %rax			; AVX1OR2-NEXT: movq (%rdi), %rax
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $3, %esi			; AVX1OR2-NEXT: andl $3, %esi
	; AVX-NEXT: movq %rax, (%rsp,%rsi,8)			; AVX1OR2-NEXT: movq %rax, (%rsp,%rsi,8)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_i64_v4i64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %esi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %ymm1
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %ymm1, %k1
				; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
				; AVX512-NEXT: retq
	%x = load i64, i64* %p			%x = load i64, i64* %p
	%ins = insertelement <4 x i64> %v, i64 %x, i32 %y			%ins = insertelement <4 x i64> %v, i64 %x, i32 %y
	ret <4 x i64> %ins			ret <4 x i64> %ins
	}			}

	define <8 x float> @load_f32_v8f32(<8 x float> %v, float* %p, i32 %y) nounwind {			define <8 x float> @load_f32_v8f32(<8 x float> %v, float* %p, i32 %y) nounwind {
	; SSE-LABEL: load_f32_v8f32:			; SSE-LABEL: load_f32_v8f32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero			; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $7, %esi			; SSE-NEXT: andl $7, %esi
	; SSE-NEXT: movss %xmm2, -40(%rsp,%rsi,4)			; SSE-NEXT: movss %xmm2, -40(%rsp,%rsi,4)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_f32_v8f32:			; AVX1OR2-LABEL: load_f32_v8f32:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $7, %esi			; AVX1OR2-NEXT: andl $7, %esi
	; AVX-NEXT: vmovss %xmm1, (%rsp,%rsi,4)			; AVX1OR2-NEXT: vmovss %xmm1, (%rsp,%rsi,4)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_f32_v8f32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpbroadcastd %esi, %ymm1
				; AVX512-NEXT: vpcmpeqd {{.*}}(%rip), %ymm1, %k1
				; AVX512-NEXT: vbroadcastss (%rdi), %ymm0 {%k1}
				; AVX512-NEXT: retq
	%x = load float, float* %p			%x = load float, float* %p
	%ins = insertelement <8 x float> %v, float %x, i32 %y			%ins = insertelement <8 x float> %v, float %x, i32 %y
	ret <8 x float> %ins			ret <8 x float> %ins
	}			}

	define <4 x double> @load_f64_v4f64(<4 x double> %v, double* %p, i32 %y) nounwind {			define <4 x double> @load_f64_v4f64(<4 x double> %v, double* %p, i32 %y) nounwind {
	; SSE-LABEL: load_f64_v4f64:			; SSE-LABEL: load_f64_v4f64:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: # kill: def $esi killed $esi def $rsi			; SSE-NEXT: # kill: def $esi killed $esi def $rsi
	; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero			; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
	; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)			; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
	; SSE-NEXT: andl $3, %esi			; SSE-NEXT: andl $3, %esi
	; SSE-NEXT: movsd %xmm2, -40(%rsp,%rsi,8)			; SSE-NEXT: movsd %xmm2, -40(%rsp,%rsi,8)
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
	; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1			; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: load_f64_v4f64:			; AVX1OR2-LABEL: load_f64_v4f64:
	; AVX: # %bb.0:			; AVX1OR2: # %bb.0:
	; AVX-NEXT: pushq %rbp			; AVX1OR2-NEXT: pushq %rbp
	; AVX-NEXT: movq %rsp, %rbp			; AVX1OR2-NEXT: movq %rsp, %rbp
	; AVX-NEXT: andq $-32, %rsp			; AVX1OR2-NEXT: andq $-32, %rsp
	; AVX-NEXT: subq $64, %rsp			; AVX1OR2-NEXT: subq $64, %rsp
	; AVX-NEXT: # kill: def $esi killed $esi def $rsi			; AVX1OR2-NEXT: # kill: def $esi killed $esi def $rsi
	; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero			; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
	; AVX-NEXT: vmovaps %ymm0, (%rsp)			; AVX1OR2-NEXT: vmovaps %ymm0, (%rsp)
	; AVX-NEXT: andl $3, %esi			; AVX1OR2-NEXT: andl $3, %esi
	; AVX-NEXT: vmovsd %xmm1, (%rsp,%rsi,8)			; AVX1OR2-NEXT: vmovsd %xmm1, (%rsp,%rsi,8)
	; AVX-NEXT: vmovaps (%rsp), %ymm0			; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0
	; AVX-NEXT: movq %rbp, %rsp			; AVX1OR2-NEXT: movq %rbp, %rsp
	; AVX-NEXT: popq %rbp			; AVX1OR2-NEXT: popq %rbp
	; AVX-NEXT: retq			; AVX1OR2-NEXT: retq
				;
				; AVX512-LABEL: load_f64_v4f64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: movslq %esi, %rax
				; AVX512-NEXT: vpbroadcastq %rax, %ymm1
				; AVX512-NEXT: vpcmpeqq {{.*}}(%rip), %ymm1, %k1
				; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1}
				; AVX512-NEXT: retq
	%x = load double, double* %p			%x = load double, double* %p
	%ins = insertelement <4 x double> %v, double %x, i32 %y			%ins = insertelement <4 x double> %v, double %x, i32 %y
	ret <4 x double> %ins			ret <4 x double> %ins
	}			}

	; Don't die trying to insert to an invalid index.			; Don't die trying to insert to an invalid index.

	define i32 @PR44139(<16 x i64>* %p) {			define i32 @PR44139(<16 x i64>* %p) {
	Show All 25 Lines