This is an archive of the discontinued LLVM Phabricator instance.

[AVX-512] Don't use unmasked VMOVDQU8/16 for 8-bit or 16-bit element stores even when BWI instructions are supported. Always use VMOVDQA32/VMOVDQU32.
ClosedPublic

Authored by craig.topper on Jul 27 2017, 11:45 PM.

Download Raw Diff

Details

Reviewers

zvi
RKSimon

Commits

rG2462a713ae4e: [AVX-512] Don't use unmasked VMOVDQU8/16 for 8-bit or 16-bit element stores…
rL309693: [AVX-512] Don't use unmasked VMOVDQU8/16 for 8-bit or 16-bit element stores…

Summary

We were already using the 32-bit element opcode when BWI isn't enabled, and there's no reason to change the opcode when we do have BWI. We will still use the 8/16-bit element opcodes for masked stores, though.

This allows us to use the aligned opcode when we can, which makes our test output more consistent between different modes. It also reduces the number of isel patterns we need.

This is a slight inconsistency with loads, which default to 64-bit element opcodes. I'll probably rectify that in a future patch.

Diff Detail

Event Timeline

craig.topper created this revision. · Jul 27 2017, 11:45 PM

craig.topper added a parent revision: D35977: [AVX-512] Remove patterns that select vmovdqu8/16 for unmasked loads. Prefer vmovdqa64/vmovdqu64 instead.

RKSimon added inline comments. · Jul 31 2017, 10:22 AM

test/CodeGen/X86/avx512-insert-extract.ll
5	Maybe add a --check-prefix=CHECK first option?
test/CodeGen/X86/subvector-broadcast.ll
903	Is this a missed execution domain opportunity? Same for the others below
test/CodeGen/X86/x86-interleaved-access.ll
4	Just noticed this is called AVX3?! Is that a good idea?

Fixed the execution domain issue in r309632.

Changed AVX3 to AVX512 in r309625.

Added common prefix to avx512-insert-extract.ll in r309629

LGTM

This revision is now accepted and ready to land. · Aug 1 2017, 5:38 AM

Closed by commit rL309693: [AVX-512] Don't use unmasked VMOVDQU8/16 for 8-bit or 16-bit element stores… (authored by ctopper). · Explain Why · Aug 1 2017, 8:32 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

X86InstrAVX512.td

42 lines

test/

CodeGen/

X86/

avg.ll

12 lines

avx512-insert-extract.ll

59 lines

avx512-insert-extract_i1.ll

2 lines

avx512bw-intrinsics-upgrade.ll

8 lines

avx512bw-mov.ll

4 lines

avx512bwvl-mov.ll

8 lines

subvector-broadcast.ll

152 lines

x86-interleaved-access.ll

6 lines

Diff 109014

lib/Target/X86/X86InstrAVX512.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,342 Lines • ▼ Show 20 Lines	defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
SelectOprr>, EVEX_V256;		SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,		defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
masked_load_unaligned, NoRMPattern,		masked_load_unaligned, NoRMPattern,
SelectOprr>, EVEX_V128;		SelectOprr>, EVEX_V128;
}		}
}		}

multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,		multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag st_frag, PatFrag mstore, string Name> {		PatFrag st_frag, PatFrag mstore, string Name,
		bit NoMRPattern = 0> {

let hasSideEffects = 0 in {		let hasSideEffects = 0 in {
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),		def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # ".s\t{$src, $dst\|$dst, $src}",		OpcodeStr # ".s\t{$src, $dst\|$dst, $src}",
[], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;		[], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),		def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),		(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}}\|"#		OpcodeStr # ".s\t{$src, ${dst} {${mask}}\|"#
"${dst} {${mask}}, $src}",		"${dst} {${mask}}, $src}",
[], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>;		[], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),		def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),		(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}\|" #		OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}\|" #
"${dst} {${mask}} {z}, $src}",		"${dst} {${mask}} {z}, $src}",
[], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;		[], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;
}		}

		let hasSideEffects = 0, mayStore = 1 in
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),		def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),		!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
[(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX;		!if(NoMRPattern, [],
		[(st_frag (_.VT _.RC:$src), addr:$dst)]),
		_.ExeDomain>, EVEX;
def mrk : AVX512PI<opc, MRMDestMem, (outs),		def mrk : AVX512PI<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),		(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}\|${dst} {${mask}}, $src}",		OpcodeStr # "\t{$src, ${dst} {${mask}}\|${dst} {${mask}}, $src}",
[], _.ExeDomain>, EVEX, EVEX_K;		[], _.ExeDomain>, EVEX, EVEX_K;

def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),		def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
(!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,		(!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
_.KRCWM:$mask, _.RC:$src)>;		_.KRCWM:$mask, _.RC:$src)>;
}		}


multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,		multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,		AVX512VLVectorVTInfo _, Predicate prd,
string Name> {		string Name, bit NoMRPattern = 0> {
let Predicates = [prd] in		let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, store,		defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
masked_store_unaligned, Name#Z>, EVEX_V512;		masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;

let Predicates = [prd, HasVLX] in {		let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,		defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
masked_store_unaligned, Name#Z256>, EVEX_V256;		masked_store_unaligned, Name#Z256,
		NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,		defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
masked_store_unaligned, Name#Z128>, EVEX_V128;		masked_store_unaligned, Name#Z128,
		NoMRPattern>, EVEX_V128;
}		}
}		}

multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,		multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,		AVX512VLVectorVTInfo _, Predicate prd,
string Name> {		string Name> {
let Predicates = [prd] in		let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,		defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
Show All 40 Lines
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,		defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
HasAVX512>,		HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,		avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
HasAVX512, "VMOVDQA64">,		HasAVX512, "VMOVDQA64">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;		PD, VEX_W, EVEX_CD8<64, CD8VF>;

defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,		defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,		avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
HasBWI, "VMOVDQU8">,		HasBWI, "VMOVDQU8", 1>,
XD, EVEX_CD8<8, CD8VF>;		XD, EVEX_CD8<8, CD8VF>;

defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,		defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,		avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
HasBWI, "VMOVDQU16">,		HasBWI, "VMOVDQU16", 1>,
XD, VEX_W, EVEX_CD8<16, CD8VF>;		XD, VEX_W, EVEX_CD8<16, CD8VF>;

defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,		defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
0, null_frag>,		0, null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,		avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
HasAVX512, "VMOVDQU32">,		HasAVX512, "VMOVDQU32">,
XS, EVEX_CD8<32, CD8VF>;		XS, EVEX_CD8<32, CD8VF>;

▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
(v16i32		(v16i32
(VMOVDQA32Zrrk		(VMOVDQA32Zrrk
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),		(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
(COPY_TO_REGCLASS VK8WM:$mask, VK16WM),		(COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),		(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
sub_ymm)>;		sub_ymm)>;
}		}

let Predicates = [HasVLX, NoBWI] in {		let Predicates = [HasAVX512] in {
// 128-bit load/store without BWI.		// 512-bit store.
		def : Pat<(alignedstore512 (v32i16 VR512:$src), addr:$dst),
		(VMOVDQA32Zmr addr:$dst, VR512:$src)>;
		def : Pat<(alignedstore512 (v64i8 VR512:$src), addr:$dst),
		(VMOVDQA32Zmr addr:$dst, VR512:$src)>;
		def : Pat<(store (v32i16 VR512:$src), addr:$dst),
		(VMOVDQU32Zmr addr:$dst, VR512:$src)>;
		def : Pat<(store (v64i8 VR512:$src), addr:$dst),
		(VMOVDQU32Zmr addr:$dst, VR512:$src)>;
		}

		let Predicates = [HasVLX] in {
		// 128-bit store.
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),		def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;		(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),		def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;		(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),		def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;		(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),		def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;		(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;

// 256-bit load/store without BWI.		// 256-bit store.
def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),		def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;		(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),		def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;		(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),		def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;		(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),		def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;		(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
}

let Predicates = [HasVLX] in {
// Special patterns for storing subvector extracts of lower 128-bits of 256.		// Special patterns for storing subvector extracts of lower 128-bits of 256.
// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr		// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
def : Pat<(alignedstore (v2f64 (extract_subvector		def : Pat<(alignedstore (v2f64 (extract_subvector
(v4f64 VR256X:$src), (iPTR 0))), addr:$dst),		(v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;		(VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(alignedstore (v4f32 (extract_subvector		def : Pat<(alignedstore (v4f32 (extract_subvector
(v8f32 VR256X:$src), (iPTR 0))), addr:$dst),		(v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;		(VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
▲ Show 20 Lines • Show All 6,717 Lines • Show Last 20 Lines

test/CodeGen/X86/avg.ll

	Show First 20 Lines • Show All 706 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vmovdqu %ymm0, (%rax)			; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
	; AVX512F-NEXT: vzeroupper			; AVX512F-NEXT: vzeroupper
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512BW-LABEL: avg_v64i8:			; AVX512BW-LABEL: avg_v64i8:
	; AVX512BW: # BB#0:			; AVX512BW: # BB#0:
	; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0			; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
	; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0			; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
	; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
	; AVX512BW-NEXT: vzeroupper			; AVX512BW-NEXT: vzeroupper
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	%1 = load <64 x i8>, <64 x i8>* %a			%1 = load <64 x i8>, <64 x i8>* %a
	%2 = load <64 x i8>, <64 x i8>* %b			%2 = load <64 x i8>, <64 x i8>* %b
	%3 = zext <64 x i8> %1 to <64 x i32>			%3 = zext <64 x i8> %1 to <64 x i32>
	%4 = zext <64 x i8> %2 to <64 x i32>			%4 = zext <64 x i8> %2 to <64 x i32>
	%5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>			%5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
	%6 = add nuw nsw <64 x i32> %5, %4			%6 = add nuw nsw <64 x i32> %5, %4
	▲ Show 20 Lines • Show All 372 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vpmovdw %zmm1, (%rax)			; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
	; AVX512F-NEXT: vzeroupper			; AVX512F-NEXT: vzeroupper
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512BW-LABEL: avg_v32i16:			; AVX512BW-LABEL: avg_v32i16:
	; AVX512BW: # BB#0:			; AVX512BW: # BB#0:
	; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0			; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
	; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0			; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
	; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
	; AVX512BW-NEXT: vzeroupper			; AVX512BW-NEXT: vzeroupper
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	%1 = load <32 x i16>, <32 x i16>* %a			%1 = load <32 x i16>, <32 x i16>* %a
	%2 = load <32 x i16>, <32 x i16>* %b			%2 = load <32 x i16>, <32 x i16>* %b
	%3 = zext <32 x i16> %1 to <32 x i32>			%3 = zext <32 x i16> %1 to <32 x i32>
	%4 = zext <32 x i16> %2 to <32 x i32>			%4 = zext <32 x i16> %2 to <32 x i32>
	%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>			%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
	%6 = add nuw nsw <32 x i32> %5, %4			%6 = add nuw nsw <32 x i32> %5, %4
	▲ Show 20 Lines • Show All 616 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vmovdqu %ymm0, (%rax)			; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
	; AVX512F-NEXT: vzeroupper			; AVX512F-NEXT: vzeroupper
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512BW-LABEL: avg_v64i8_2:			; AVX512BW-LABEL: avg_v64i8_2:
	; AVX512BW: # BB#0:			; AVX512BW: # BB#0:
	; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0			; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
	; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0			; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
	; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
	; AVX512BW-NEXT: vzeroupper			; AVX512BW-NEXT: vzeroupper
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	%1 = load <64 x i8>, <64 x i8>* %a			%1 = load <64 x i8>, <64 x i8>* %a
	%2 = load <64 x i8>, <64 x i8>* %b			%2 = load <64 x i8>, <64 x i8>* %b
	%3 = zext <64 x i8> %1 to <64 x i32>			%3 = zext <64 x i8> %1 to <64 x i32>
	%4 = zext <64 x i8> %2 to <64 x i32>			%4 = zext <64 x i8> %2 to <64 x i32>
	%5 = add nuw nsw <64 x i32> %4, %4			%5 = add nuw nsw <64 x i32> %4, %4
	%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>			%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
	▲ Show 20 Lines • Show All 373 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vpmovdw %zmm1, (%rax)			; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
	; AVX512F-NEXT: vzeroupper			; AVX512F-NEXT: vzeroupper
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512BW-LABEL: avg_v32i16_2:			; AVX512BW-LABEL: avg_v32i16_2:
	; AVX512BW: # BB#0:			; AVX512BW: # BB#0:
	; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0			; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
	; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0			; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
	; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
	; AVX512BW-NEXT: vzeroupper			; AVX512BW-NEXT: vzeroupper
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	%1 = load <32 x i16>, <32 x i16>* %a			%1 = load <32 x i16>, <32 x i16>* %a
	%2 = load <32 x i16>, <32 x i16>* %b			%2 = load <32 x i16>, <32 x i16>* %b
	%3 = zext <32 x i16> %1 to <32 x i32>			%3 = zext <32 x i16> %1 to <32 x i32>
	%4 = zext <32 x i16> %2 to <32 x i32>			%4 = zext <32 x i16> %2 to <32 x i32>
	%5 = add nuw nsw <32 x i32> %3, %4			%5 = add nuw nsw <32 x i32> %3, %4
	%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>			%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
	▲ Show 20 Lines • Show All 508 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vmovdqu %ymm2, (%rax)			; AVX512F-NEXT: vmovdqu %ymm2, (%rax)
	; AVX512F-NEXT: vzeroupper			; AVX512F-NEXT: vzeroupper
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512BW-LABEL: avg_v64i8_const:			; AVX512BW-LABEL: avg_v64i8_const:
	; AVX512BW: # BB#0:			; AVX512BW: # BB#0:
	; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0			; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
	; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0			; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
	; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
	; AVX512BW-NEXT: vzeroupper			; AVX512BW-NEXT: vzeroupper
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	%1 = load <64 x i8>, <64 x i8>* %a			%1 = load <64 x i8>, <64 x i8>* %a
	%2 = zext <64 x i8> %1 to <64 x i32>			%2 = zext <64 x i8> %1 to <64 x i32>
	%3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>			%3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
	%4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>			%4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
	%5 = trunc <64 x i32> %4 to <64 x i8>			%5 = trunc <64 x i32> %4 to <64 x i8>
	store <64 x i8> %5, <64 x i8>* undef, align 4			store <64 x i8> %5, <64 x i8>* undef, align 4
	▲ Show 20 Lines • Show All 291 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vpmovdw %zmm0, (%rax)			; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
	; AVX512F-NEXT: vzeroupper			; AVX512F-NEXT: vzeroupper
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512BW-LABEL: avg_v32i16_const:			; AVX512BW-LABEL: avg_v32i16_const:
	; AVX512BW: # BB#0:			; AVX512BW: # BB#0:
	; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0			; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
	; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0			; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
	; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
	; AVX512BW-NEXT: vzeroupper			; AVX512BW-NEXT: vzeroupper
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	%1 = load <32 x i16>, <32 x i16>* %a			%1 = load <32 x i16>, <32 x i16>* %a
	%2 = zext <32 x i16> %1 to <32 x i32>			%2 = zext <32 x i16> %1 to <32 x i32>
	%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>			%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
	%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>			%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
	%5 = trunc <32 x i32> %4 to <32 x i16>			%5 = trunc <32 x i32> %4 to <32 x i16>
	store <32 x i16> %5, <32 x i16>* undef, align 4			store <32 x i16> %5, <32 x i16>* undef, align 4
	ret void			ret void
	}			}

test/CodeGen/X86/avx512-insert-extract.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl \| FileCheck --check-prefix=CHECK --check-prefix=KNL %s			; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl \| FileCheck --check-prefix=CHECK --check-prefix=KNL %s
	; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx \| FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s			; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx \| FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
	; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi \| FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s			; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi \| FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s

				RKSimon (inline comment; unsubmitted, not done): Maybe add a --check-prefix=CHECK first option?
	define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {			define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
	; CHECK-LABEL: test1:			; CHECK-LABEL: test1:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]			; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
	; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2			; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
	; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0			; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
	; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]			; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
	; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0			; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
	▲ Show 20 Lines • Show All 1,820 Lines • ▼ Show 20 Lines
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t2 = extractelement <16 x float> %t1, i32 %index			%t2 = extractelement <16 x float> %t1, i32 %index
	ret float %t2			ret float %t2
	}			}

	define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {			define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
	; KNL-LABEL: test_extractelement_variable_v8i16:			; CHECK-LABEL: test_extractelement_variable_v8i16:
	; KNL: ## BB#0:			; CHECK: ## BB#0:
	; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; KNL-NEXT: andl $7, %edi			; CHECK-NEXT: andl $7, %edi
	; KNL-NEXT: movzwl -24(%rsp,%rdi,2), %eax			; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax
	; KNL-NEXT: retq			; CHECK-NEXT: retq
	;
	; SKX-LABEL: test_extractelement_variable_v8i16:
	; SKX: ## BB#0:
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
	; SKX-NEXT: andl $7, %edi
	; SKX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
	; SKX-NEXT: retq
	%t2 = extractelement <8 x i16> %t1, i32 %index			%t2 = extractelement <8 x i16> %t1, i32 %index
	ret i16 %t2			ret i16 %t2
	}			}

	define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {			define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
	; KNL-LABEL: test_extractelement_variable_v16i16:			; KNL-LABEL: test_extractelement_variable_v16i16:
	; KNL: ## BB#0:			; KNL: ## BB#0:
	; KNL-NEXT: pushq %rbp			; KNL-NEXT: pushq %rbp
	Show All 22 Lines
	; SKX-NEXT: Lcfi25:			; SKX-NEXT: Lcfi25:
	; SKX-NEXT: .cfi_offset %rbp, -16			; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi26:			; SKX-NEXT: Lcfi26:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-32, %rsp			; SKX-NEXT: andq $-32, %rsp
	; SKX-NEXT: subq $64, %rsp			; SKX-NEXT: subq $64, %rsp
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vmovdqu %ymm0, (%rsp)			; SKX-NEXT: vmovaps %ymm0, (%rsp)
	; SKX-NEXT: andl $15, %edi			; SKX-NEXT: andl $15, %edi
	; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax			; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t2 = extractelement <16 x i16> %t1, i32 %index			%t2 = extractelement <16 x i16> %t1, i32 %index
	ret i16 %t2			ret i16 %t2
	Show All 29 Lines
	; SKX-NEXT: Lcfi28:			; SKX-NEXT: Lcfi28:
	; SKX-NEXT: .cfi_offset %rbp, -16			; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi29:			; SKX-NEXT: Lcfi29:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-64, %rsp			; SKX-NEXT: andq $-64, %rsp
	; SKX-NEXT: subq $128, %rsp			; SKX-NEXT: subq $128, %rsp
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)			; SKX-NEXT: vmovaps %zmm0, (%rsp)
	; SKX-NEXT: andl $31, %edi			; SKX-NEXT: andl $31, %edi
	; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax			; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t2 = extractelement <32 x i16> %t1, i32 %index			%t2 = extractelement <32 x i16> %t1, i32 %index
	ret i16 %t2			ret i16 %t2
	}			}

	define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {			define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
	; KNL-LABEL: test_extractelement_variable_v16i8:			; CHECK-LABEL: test_extractelement_variable_v16i8:
	; KNL: ## BB#0:			; CHECK: ## BB#0:
	; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)			; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
	; KNL-NEXT: andl $15, %edi			; CHECK-NEXT: andl $15, %edi
	; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax			; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
	; KNL-NEXT: movb (%rdi,%rax), %al			; CHECK-NEXT: movb (%rdi,%rax), %al
	; KNL-NEXT: retq			; CHECK-NEXT: retq
	;
	; SKX-LABEL: test_extractelement_variable_v16i8:
	; SKX: ## BB#0:
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
	; SKX-NEXT: andl $15, %edi
	; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
	; SKX-NEXT: movb (%rdi,%rax), %al
	; SKX-NEXT: retq
	%t2 = extractelement <16 x i8> %t1, i32 %index			%t2 = extractelement <16 x i8> %t1, i32 %index
	ret i8 %t2			ret i8 %t2
	}			}

	define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {			define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
	; KNL-LABEL: test_extractelement_variable_v32i8:			; KNL-LABEL: test_extractelement_variable_v32i8:
	; KNL: ## BB#0:			; KNL: ## BB#0:
	; KNL-NEXT: pushq %rbp			; KNL-NEXT: pushq %rbp
	Show All 23 Lines
	; SKX-NEXT: Lcfi31:			; SKX-NEXT: Lcfi31:
	; SKX-NEXT: .cfi_offset %rbp, -16			; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi32:			; SKX-NEXT: Lcfi32:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-32, %rsp			; SKX-NEXT: andq $-32, %rsp
	; SKX-NEXT: subq $64, %rsp			; SKX-NEXT: subq $64, %rsp
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vmovdqu %ymm0, (%rsp)			; SKX-NEXT: vmovaps %ymm0, (%rsp)
	; SKX-NEXT: andl $31, %edi			; SKX-NEXT: andl $31, %edi
	; SKX-NEXT: movq %rsp, %rax			; SKX-NEXT: movq %rsp, %rax
	; SKX-NEXT: movb (%rdi,%rax), %al			; SKX-NEXT: movb (%rdi,%rax), %al
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq

	Show All 32 Lines
	; SKX-NEXT: Lcfi34:			; SKX-NEXT: Lcfi34:
	; SKX-NEXT: .cfi_offset %rbp, -16			; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi35:			; SKX-NEXT: Lcfi35:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-64, %rsp			; SKX-NEXT: andq $-64, %rsp
	; SKX-NEXT: subq $128, %rsp			; SKX-NEXT: subq $128, %rsp
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)			; SKX-NEXT: vmovaps %zmm0, (%rsp)
	; SKX-NEXT: andl $63, %edi			; SKX-NEXT: andl $63, %edi
	; SKX-NEXT: movq %rsp, %rax			; SKX-NEXT: movq %rsp, %rax
	; SKX-NEXT: movb (%rdi,%rax), %al			; SKX-NEXT: movb (%rdi,%rax), %al
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq

	Show All 33 Lines
	; SKX-NEXT: Lcfi37:			; SKX-NEXT: Lcfi37:
	; SKX-NEXT: .cfi_offset %rbp, -16			; SKX-NEXT: .cfi_offset %rbp, -16
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi38:			; SKX-NEXT: Lcfi38:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-64, %rsp			; SKX-NEXT: andq $-64, %rsp
	; SKX-NEXT: subq $128, %rsp			; SKX-NEXT: subq $128, %rsp
	; SKX-NEXT: addb %dil, %dil			; SKX-NEXT: addb %dil, %dil
	; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)			; SKX-NEXT: vmovaps %zmm0, (%rsp)
	; SKX-NEXT: movzbl %dil, %eax			; SKX-NEXT: movzbl %dil, %eax
	; SKX-NEXT: andl $63, %eax			; SKX-NEXT: andl $63, %eax
	; SKX-NEXT: movq %rsp, %rcx			; SKX-NEXT: movq %rsp, %rcx
	; SKX-NEXT: movb (%rax,%rcx), %al			; SKX-NEXT: movb (%rax,%rcx), %al
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	▲ Show 20 Lines • Show All 208 Lines • ▼ Show 20 Lines
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi47:			; SKX-NEXT: Lcfi47:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-64, %rsp			; SKX-NEXT: andq $-64, %rsp
	; SKX-NEXT: subq $128, %rsp			; SKX-NEXT: subq $128, %rsp
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0			; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
	; SKX-NEXT: vpmovm2w %k0, %zmm0			; SKX-NEXT: vpmovm2w %k0, %zmm0
	; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)			; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
	; SKX-NEXT: andl $31, %edi			; SKX-NEXT: andl $31, %edi
	; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax			; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
	; SKX-NEXT: andl $1, %eax			; SKX-NEXT: andl $1, %eax
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t1 = icmp ugt <32 x i8> %a, %b			%t1 = icmp ugt <32 x i8> %a, %b
	%t2 = extractelement <32 x i1> %t1, i32 %index			%t2 = extractelement <32 x i1> %t1, i32 %index
	%res = zext i1 %t2 to i8			%res = zext i1 %t2 to i8
	ret i8 %res			ret i8 %res
	}			}

test/CodeGen/X86/avx512-insert-extract_i1.ll

	Show All 13 Lines
	; SKX-NEXT: movq %rsp, %rbp			; SKX-NEXT: movq %rsp, %rbp
	; SKX-NEXT: Lcfi2:			; SKX-NEXT: Lcfi2:
	; SKX-NEXT: .cfi_def_cfa_register %rbp			; SKX-NEXT: .cfi_def_cfa_register %rbp
	; SKX-NEXT: andq $-64, %rsp			; SKX-NEXT: andq $-64, %rsp
	; SKX-NEXT: subq $128, %rsp			; SKX-NEXT: subq $128, %rsp
	; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>			; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
	; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0			; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
	; SKX-NEXT: vpmovm2b %k0, %zmm0			; SKX-NEXT: vpmovm2b %k0, %zmm0
	; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)			; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
	; SKX-NEXT: andl $63, %edi			; SKX-NEXT: andl $63, %edi
	; SKX-NEXT: movq %rsp, %rax			; SKX-NEXT: movq %rsp, %rax
	; SKX-NEXT: movzbl (%rdi,%rax), %eax			; SKX-NEXT: movzbl (%rdi,%rax), %eax
	; SKX-NEXT: andl $1, %eax			; SKX-NEXT: andl $1, %eax
	; SKX-NEXT: movq %rbp, %rsp			; SKX-NEXT: movq %rbp, %rsp
	; SKX-NEXT: popq %rbp			; SKX-NEXT: popq %rbp
	; SKX-NEXT: vzeroupper			; SKX-NEXT: vzeroupper
	; SKX-NEXT: retq			; SKX-NEXT: retq
	%t1 = icmp ugt <64 x i8> %a, %b			%t1 = icmp ugt <64 x i8> %a, %b
	%t2 = extractelement <64 x i1> %t1, i32 %index			%t2 = extractelement <64 x i1> %t1, i32 %index
	%res = zext i1 %t2 to i8			%res = zext i1 %t2 to i8
	ret i8 %res			ret i8 %res
	}			}

test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW			; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
	; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32			; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw \| FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32

	declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)			declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)

	define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {			define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
	; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:			; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
	; AVX512BW: ## BB#0:			; AVX512BW: ## BB#0:
	; AVX512BW-NEXT: kmovq %rdx, %k1			; AVX512BW-NEXT: kmovq %rdx, %k1
	; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}			; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
	; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	;			;
	; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:			; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
	; AVX512F-32: # BB#0:			; AVX512F-32: # BB#0:
	; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax			; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx			; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1			; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
	; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}			; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
	; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax)			; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
	; AVX512F-32-NEXT: retl			; AVX512F-32-NEXT: retl
	call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)			call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
	call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)			call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
	ret void			ret void
	}			}

	declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)			declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)

	define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {			define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
	; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:			; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
	; AVX512BW: ## BB#0:			; AVX512BW: ## BB#0:
	; AVX512BW-NEXT: kmovd %edx, %k1			; AVX512BW-NEXT: kmovd %edx, %k1
	; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}			; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
	; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi)			; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	;			;
	; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:			; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
	; AVX512F-32: # BB#0:			; AVX512F-32: # BB#0:
	; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax			; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx			; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1			; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
	; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}			; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
	; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax)			; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
	; AVX512F-32-NEXT: retl			; AVX512F-32-NEXT: retl
	call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)			call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
	call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)			call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
	ret void			ret void
	}			}

	declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)			declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)

	▲ Show 20 Lines • Show All 3,564 Lines • Show Last 20 Lines

test/CodeGen/X86/avx512bw-mov.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw \| FileCheck %s		; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw \| FileCheck %s

define <64 x i8> @test1(i8 * %addr) {		define <64 x i8> @test1(i8 * %addr) {
; CHECK-LABEL: test1:		; CHECK-LABEL: test1:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovups (%rdi), %zmm0		; CHECK-NEXT: vmovups (%rdi), %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*		%vaddr = bitcast i8* %addr to <64 x i8>*
%res = load <64 x i8>, <64 x i8>* %vaddr, align 1		%res = load <64 x i8>, <64 x i8>* %vaddr, align 1
ret <64 x i8>%res		ret <64 x i8>%res
}		}

define void @test2(i8 * %addr, <64 x i8> %data) {		define void @test2(i8 * %addr, <64 x i8> %data) {
; CHECK-LABEL: test2:		; CHECK-LABEL: test2:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu8 %zmm0, (%rdi)		; CHECK-NEXT: vmovups %zmm0, (%rdi)
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*		%vaddr = bitcast i8* %addr to <64 x i8>*
store <64 x i8>%data, <64 x i8>* %vaddr, align 1		store <64 x i8>%data, <64 x i8>* %vaddr, align 1
ret void		ret void
}		}

define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {		define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
; CHECK-LABEL: test3:		; CHECK-LABEL: test3:
Show All 31 Lines	; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*		%vaddr = bitcast i8* %addr to <32 x i16>*
%res = load <32 x i16>, <32 x i16>* %vaddr, align 1		%res = load <32 x i16>, <32 x i16>* %vaddr, align 1
ret <32 x i16>%res		ret <32 x i16>%res
}		}

define void @test6(i8 * %addr, <32 x i16> %data) {		define void @test6(i8 * %addr, <32 x i16> %data) {
; CHECK-LABEL: test6:		; CHECK-LABEL: test6:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu16 %zmm0, (%rdi)		; CHECK-NEXT: vmovups %zmm0, (%rdi)
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*		%vaddr = bitcast i8* %addr to <32 x i16>*
store <32 x i16>%data, <32 x i16>* %vaddr, align 1		store <32 x i16>%data, <32 x i16>* %vaddr, align 1
ret void		ret void
}		}

define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {		define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
; CHECK-LABEL: test7:		; CHECK-LABEL: test7:
▲ Show 20 Lines • Show All 145 Lines • Show Last 20 Lines

test/CodeGen/X86/avx512bwvl-mov.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding\| FileCheck %s		; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding\| FileCheck %s

define <32 x i8> @test_256_1(i8 * %addr) {		define <32 x i8> @test_256_1(i8 * %addr) {
; CHECK-LABEL: test_256_1:		; CHECK-LABEL: test_256_1:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]		; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]		; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*		%vaddr = bitcast i8* %addr to <32 x i8>*
%res = load <32 x i8>, <32 x i8>* %vaddr, align 1		%res = load <32 x i8>, <32 x i8>* %vaddr, align 1
ret <32 x i8>%res		ret <32 x i8>%res
}		}

define void @test_256_2(i8 * %addr, <32 x i8> %data) {		define void @test_256_2(i8 * %addr, <32 x i8> %data) {
; CHECK-LABEL: test_256_2:		; CHECK-LABEL: test_256_2:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07]		; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]		; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*		%vaddr = bitcast i8* %addr to <32 x i8>*
store <32 x i8>%data, <32 x i8>* %vaddr, align 1		store <32 x i8>%data, <32 x i8>* %vaddr, align 1
ret void		ret void
}		}

define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {		define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
; CHECK-LABEL: test_256_3:		; CHECK-LABEL: test_256_3:
Show All 31 Lines	; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*		%vaddr = bitcast i8* %addr to <16 x i16>*
%res = load <16 x i16>, <16 x i16>* %vaddr, align 1		%res = load <16 x i16>, <16 x i16>* %vaddr, align 1
ret <16 x i16>%res		ret <16 x i16>%res
}		}

define void @test_256_6(i8 * %addr, <16 x i16> %data) {		define void @test_256_6(i8 * %addr, <16 x i16> %data) {
; CHECK-LABEL: test_256_6:		; CHECK-LABEL: test_256_6:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07]		; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]		; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*		%vaddr = bitcast i8* %addr to <16 x i16>*
store <16 x i16>%data, <16 x i16>* %vaddr, align 1		store <16 x i16>%data, <16 x i16>* %vaddr, align 1
ret void		ret void
}		}

define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {		define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
; CHECK-LABEL: test_256_7:		; CHECK-LABEL: test_256_7:
Show All 31 Lines	; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*		%vaddr = bitcast i8* %addr to <16 x i8>*
%res = load <16 x i8>, <16 x i8>* %vaddr, align 1		%res = load <16 x i8>, <16 x i8>* %vaddr, align 1
ret <16 x i8>%res		ret <16 x i8>%res
}		}

define void @test_128_2(i8 * %addr, <16 x i8> %data) {		define void @test_128_2(i8 * %addr, <16 x i8> %data) {
; CHECK-LABEL: test_128_2:		; CHECK-LABEL: test_128_2:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07]		; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]		; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*		%vaddr = bitcast i8* %addr to <16 x i8>*
store <16 x i8>%data, <16 x i8>* %vaddr, align 1		store <16 x i8>%data, <16 x i8>* %vaddr, align 1
ret void		ret void
}		}

define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {		define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
; CHECK-LABEL: test_128_3:		; CHECK-LABEL: test_128_3:
Show All 31 Lines	; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*		%vaddr = bitcast i8* %addr to <8 x i16>*
%res = load <8 x i16>, <8 x i16>* %vaddr, align 1		%res = load <8 x i16>, <8 x i16>* %vaddr, align 1
ret <8 x i16>%res		ret <8 x i16>%res
}		}

define void @test_128_6(i8 * %addr, <8 x i16> %data) {		define void @test_128_6(i8 * %addr, <8 x i16> %data) {
; CHECK-LABEL: test_128_6:		; CHECK-LABEL: test_128_6:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07]		; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]		; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*		%vaddr = bitcast i8* %addr to <8 x i16>*
store <8 x i16>%data, <8 x i16>* %vaddr, align 1		store <8 x i16>%data, <8 x i16>* %vaddr, align 1
ret void		ret void
}		}

define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {		define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
; CHECK-LABEL: test_128_7:		; CHECK-LABEL: test_128_7:
Show All 26 Lines

test/CodeGen/X86/subvector-broadcast.ll

	Show First 20 Lines • Show All 888 Lines • ▼ Show 20 Lines
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <4 x i32>, <4 x i32>* %p0			%1 = load <4 x i32>, <4 x i32>* %p0
	store <4 x i32> %1, <4 x i32>* %p1			store <4 x i32> %1, <4 x i32>* %p1
	%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>			%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
	ret <8 x i32> %2			ret <8 x i32> %2
	}			}

	define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {			define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
	; X32-AVX-LABEL: test_broadcast_8i16_16i16_reuse:			; X32-LABEL: test_broadcast_8i16_16i16_reuse:
	; X32-AVX: ## BB#0:			; X32: ## BB#0:
	; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX-NEXT: vmovaps (%ecx), %xmm0			; X32-NEXT: vmovaps (%ecx), %xmm0
	; X32-AVX-NEXT: vmovaps %xmm0, (%eax)			; X32-NEXT: vmovaps %xmm0, (%eax)
	; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
				RKSimon (inline comment; unsubmitted, not done): Is this a missed execution domain opportunity? Same for the others below.
	; X32-AVX-NEXT: retl			; X32-NEXT: retl
	;
	; X32-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
	; X32-AVX512F: ## BB#0:
	; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
	; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
	; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX512F-NEXT: retl
	;
	; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
	; X32-AVX512BW: ## BB#0:
	; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
	; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
	; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX512BW-NEXT: retl
	;
	; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
	; X32-AVX512DQ: ## BB#0:
	; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
	; X32-AVX512DQ-NEXT: vmovaps %xmm0, (%eax)
	; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX512DQ-NEXT: retl
	;
	; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
	; X64-AVX: ## BB#0:
	; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
	; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX-NEXT: retq
	;
	; X64-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
	; X64-AVX512F: ## BB#0:
	; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
	; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX512F-NEXT: retq
	;
	; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
	; X64-AVX512BW: ## BB#0:
	; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
	; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
	; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX512BW-NEXT: retq
	;			;
	; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:			; X64-LABEL: test_broadcast_8i16_16i16_reuse:
	; X64-AVX512DQ: ## BB#0:			; X64: ## BB#0:
	; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)			; X64-NEXT: vmovaps %xmm0, (%rsi)
	; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX512DQ-NEXT: retq			; X64-NEXT: retq
	%1 = load <8 x i16>, <8 x i16> *%p0			%1 = load <8 x i16>, <8 x i16> *%p0
	store <8 x i16> %1, <8 x i16>* %p1			store <8 x i16> %1, <8 x i16>* %p1
	%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>			%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
	ret <16 x i16> %2			ret <16 x i16> %2
	}			}

	define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {			define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
	; X32-AVX-LABEL: test_broadcast_16i8_32i8_reuse:			; X32-LABEL: test_broadcast_16i8_32i8_reuse:
	; X32-AVX: ## BB#0:			; X32: ## BB#0:
	; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX-NEXT: vmovaps (%ecx), %xmm0			; X32-NEXT: vmovaps (%ecx), %xmm0
	; X32-AVX-NEXT: vmovaps %xmm0, (%eax)			; X32-NEXT: vmovaps %xmm0, (%eax)
	; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX-NEXT: retl			; X32-NEXT: retl
	;
	; X32-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
	; X32-AVX512F: ## BB#0:
	; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
	; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
	; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX512F-NEXT: retl
	;
	; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
	; X32-AVX512BW: ## BB#0:
	; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
	; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
	; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX512BW-NEXT: retl
	;
	; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
	; X32-AVX512DQ: ## BB#0:
	; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
	; X32-AVX512DQ-NEXT: vmovaps %xmm0, (%eax)
	; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-AVX512DQ-NEXT: retl
	;
	; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
	; X64-AVX: ## BB#0:
	; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
	; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX-NEXT: retq
	;
	; X64-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
	; X64-AVX512F: ## BB#0:
	; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
	; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX512F-NEXT: retq
	;
	; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
	; X64-AVX512BW: ## BB#0:
	; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
	; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
	; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX512BW-NEXT: retq
	;			;
	; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:			; X64-LABEL: test_broadcast_16i8_32i8_reuse:
	; X64-AVX512DQ: ## BB#0:			; X64: ## BB#0:
	; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)			; X64-NEXT: vmovaps %xmm0, (%rsi)
	; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-AVX512DQ-NEXT: retq			; X64-NEXT: retq
	%1 = load <16 x i8>, <16 x i8> *%p0			%1 = load <16 x i8>, <16 x i8> *%p0
	store <16 x i8> %1, <16 x i8>* %p1			store <16 x i8> %1, <16 x i8>* %p1
	%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>			%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	ret <32 x i8> %2			ret <32 x i8> %2
	}			}

	;			;
	; Subvector Load + Broadcast with Separate Store			; Subvector Load + Broadcast with Separate Store
	▲ Show 20 Lines • Show All 341 Lines • Show Last 20 Lines

test/CodeGen/X86/x86-interleaved-access.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx \| FileCheck %s --check-prefix=AVX1			; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx \| FileCheck %s --check-prefix=AVX1
	; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 \| FileCheck %s --check-prefix=AVX --check-prefix=AVX2			; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 \| FileCheck %s --check-prefix=AVX --check-prefix=AVX2
	; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw \| FileCheck %s --check-prefix=AVX --check-prefix=AVX512			; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw \| FileCheck %s --check-prefix=AVX --check-prefix=AVX512
				RKSimon (inline comment; unsubmitted, not done): Just noticed this is called AVX3?! Is that a good idea?

	define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {			define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
	; AVX1-LABEL: load_factorf64_4:			; AVX1-LABEL: load_factorf64_4:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vmovupd (%rdi), %ymm0			; AVX1-NEXT: vmovupd (%rdi), %ymm0
	; AVX1-NEXT: vmovupd 32(%rdi), %ymm1			; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
	; AVX1-NEXT: vmovupd 64(%rdi), %ymm2			; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
	; AVX1-NEXT: vmovupd 96(%rdi), %ymm3			; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
	▲ Show 20 Lines • Show All 343 Lines • ▼ Show 20 Lines
	; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]			; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
	; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]			; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
	; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2			; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
	; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4			; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4
	; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]			; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
	; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]			; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
	; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2			; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
	; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0			; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
	; AVX512-NEXT: vmovdqu8 %zmm0, 64(%rdi)			; AVX512-NEXT: vmovdqa32 %zmm0, 64(%rdi)
	; AVX512-NEXT: vmovdqu8 %zmm2, (%rdi)			; AVX512-NEXT: vmovdqa32 %zmm2, (%rdi)
	; AVX512-NEXT: vzeroupper			; AVX512-NEXT: vzeroupper
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>			%v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
	%v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>			%v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
	%interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>			%interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, 
i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
	store <128 x i8> %interleaved.vec, <128 x i8>* %p			store <128 x i8> %interleaved.vec, <128 x i8>* %p
	ret void			ret void
	}			}
	▲ Show 20 Lines • Show All 65 Lines • ▼ Show 20 Lines
	; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23]			; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23]
	; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u]			; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u]
	; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1			; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
	; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u]			; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u]
	; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u,u]			; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u,u]
	; AVX512-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0			; AVX512-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
	; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]			; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
	; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0			; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
	; AVX512-NEXT: vmovdqu8 %zmm0, (%rdi)			; AVX512-NEXT: vmovdqa32 %zmm0, (%rdi)
	; AVX512-NEXT: vzeroupper			; AVX512-NEXT: vzeroupper
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>			%v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
	%v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>			%v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
	%interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>			%interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
	store <64 x i8> %interleaved.vec, <64 x i8>* %p			store <64 x i8> %interleaved.vec, <64 x i8>* %p
	ret void			ret void
	}			}
	▲ Show 20 Lines • Show All 544 Lines • Show Last 20 Lines