This is an archive of the discontinued LLVM Phabricator instance.

[X86] Use vmovq for v4i64/v4f64/v8i64/v8f64 vzmovl.
ClosedPublic

Authored by craig.topper on Jun 15 2019, 12:19 AM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel

Commits

rG6af1be96641f: [X86] Use vmovq for v4i64/v4f64/v8i64/v8f64 vzmovl.

Summary

We already use vmovq for v2i64/v2f64 vzmovl. But we were using a
blend with 0 for v4i64/v4f64 and vmovsd with 0 for v8i64/v8f64.

I think the blend with 0 or scalar movss/d is only needed for
vXi32 where we don't have an instruction that can move 32
bits from one xmm to another while zeroing upper bits.

Diff Detail

Repository

rG LLVM Github Monorepo

Build Status

Buildable 33440
Build 33439: arc lint + arc unit

Event Timeline

craig.topper created this revision. Jun 15 2019, 12:19 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jun 15 2019, 12:19 AM

Herald added a subscriber: hiraditya. · View Herald Transcript

Harbormaster completed remote builds in B33440: Diff 204902. Jun 15 2019, 12:22 AM

PR34876 and PR34874 suggest we should prefer BLEND over MOVSD/MOVQ etc.?

I think those titles were based on what I thought we were doing. I was probably confused by v4i64/v8i64 using blend without noticing v2i64 isn’t. I think movq is available on ports 0/1/5 on Sandy Bridge.

craig.topper mentioned this in D63512: [X86] Add DAG combine to turn (vzmovl (insert_subvector undef, X, 0)) into (insert_subvector allzeros, (vzmovl X), 0). Jun 18 2019, 12:53 PM

In D63373#1546183, @craig.topper wrote:

I think those titles were based on what I thought we were doing. I was probably confused by v4i64/v8i64 using blend without noticing v2i64 isn’t. I think movq is available on ports 0/1/5 on Sandy Bridge.

LGTM - I can't find a case where MOVQ is worse than VBLEND/VPBLEND - please can you update PR34876 and PR34874 to make that clear.

RKSimon accepted this revision. Jun 21 2019, 3:35 AM

This revision is now accepted and ready to land. Jun 21 2019, 3:35 AM

Committed in r364079

craig.topper added a commit: rG6af1be96641f: [X86] Use vmovq for v4i64/v4f64/v8i64/v8f64 vzmovl. Jun 23 2019, 11:39 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86InstrAVX512.td

42 lines

X86InstrSSE.td

35 lines

test/

CodeGen/

X86/

vec_extract-avx.ll

28 lines

vector-extend-inreg.ll

16 lines

vector-shuffle-256-v4.ll

9 lines

vector-shuffle-512-v8.ll

6 lines

vector-shuffle-combining-avx2.ll

3 lines

Diff 204902

llvm/lib/Target/X86/X86InstrAVX512.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,292 Lines • ▼ Show 20 Lines	def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),		(v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>;		(v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),		def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),		(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;		(v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;

def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
(v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
(v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;

def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),		def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),		(v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>;		(v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),		def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),		(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;		(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;

def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;

def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;

}		}

// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than		// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.		// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
let Predicates = [HasAVX512, OptForSpeed] in {		let Predicates = [HasAVX512, OptForSpeed] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),		def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),		(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
▲ Show 20 Lines • Show All 126 Lines • ▼ Show 20 Lines	def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),		(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;		(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;

// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.		// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v16i32 (X86vzload addr:$src)),		def : Pat<(v16i32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;		(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),		def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;		(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;

		def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
		(SUBREG_TO_REG (i32 0),
		(v2f64 (VMOVZPQILo2PQIZrr
		(v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
		sub_xmm)>;
		def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
		(SUBREG_TO_REG (i32 0),
		(v2i64 (VMOVZPQILo2PQIZrr
		(v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
		sub_xmm)>;

		def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
		(SUBREG_TO_REG (i32 0),
		(v2f64 (VMOVZPQILo2PQIZrr
		(v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
		sub_xmm)>;
		def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
		(SUBREG_TO_REG (i32 0),
		(v2i64 (VMOVZPQILo2PQIZrr
		(v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
		sub_xmm)>;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals		// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),		def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst\|$dst, $src}",		(ins i512mem:$src), "vmovntdqa\t{$src, $dst\|$dst, $src}",
▲ Show 20 Lines • Show All 8,018 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 294 Lines • ▼ Show 20 Lines	let Predicates = [UseAVX, OptForSize] in {
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),		def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSrr (v4f32 (V_SET0)),		(v4f32 (VMOVSSrr (v4f32 (V_SET0)),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;		(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),		def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSrr (v4i32 (V_SET0)),		(v4i32 (VMOVSSrr (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;		(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;

def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v2f64 (VMOVSDrr (v2f64 (V_SET0)),
(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v2i64 (VMOVSDrr (v2i64 (V_SET0)),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
sub_xmm)>;
}		}

let Predicates = [UseSSE1] in {		let Predicates = [UseSSE1] in {
let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {		let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a		// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.		// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),		def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;		(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
▲ Show 20 Lines • Show All 4,001 Lines • ▼ Show 20 Lines	let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),		def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(VMOVZPQILo2PQIrr VR128:$src)>;		(VMOVZPQILo2PQIrr VR128:$src)>;
}		}
let Predicates = [UseSSE2] in {		let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),		def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(MOVZPQILo2PQIrr VR128:$src)>;		(MOVZPQILo2PQIrr VR128:$src)>;
}		}

		let Predicates = [UseAVX] in {
		def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
		(SUBREG_TO_REG (i32 0),
		(v2f64 (VMOVZPQILo2PQIrr
		(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
		sub_xmm)>;
		def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
		(SUBREG_TO_REG (i32 0),
		(v2i64 (VMOVZPQILo2PQIrr
		(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
		sub_xmm)>;
		}

//===---------------------------------------------------------------------===//		//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP		// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//		//===---------------------------------------------------------------------===//

multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,		multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
ValueType vt, RegisterClass RC, PatFrag mem_frag,		ValueType vt, RegisterClass RC, PatFrag mem_frag,
X86MemOperand x86memop, X86FoldableSchedWrite sched> {		X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),		def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
▲ Show 20 Lines • Show All 2,011 Lines • ▼ Show 20 Lines	def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),		(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),		(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
(i8 1))), sub_xmm)>;		(i8 1))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),		def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
(SUBREG_TO_REG (i32 0),		(SUBREG_TO_REG (i32 0),
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),		(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),		(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
(i8 3))), sub_xmm)>;		(i8 3))), sub_xmm)>;

def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
(i8 1))), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
(i8 0xf))), sub_xmm)>;
}		}

// Prefer a movss or movsd over a blendps when optimizing for size. these were		// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge		// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.		// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {		let Predicates = [UseSSE41, OptForSpeed] in {
// With SSE41 we can use blends for these patterns.		// With SSE41 we can use blends for these patterns.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),		def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
▲ Show 20 Lines • Show All 1,751 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vec_extract-avx.ll

Show First 20 Lines • Show All 138 Lines • ▼ Show 20 Lines	; X64-NEXT: retq
ret void		ret void
}		}

define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {		define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
; X32-LABEL: legal_vzmovl_2i64_4i64:		; X32-LABEL: legal_vzmovl_2i64_4i64:
; X32: # %bb.0:		; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx		; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups (%ecx), %xmm0		; X32-NEXT: vmovdqu (%ecx), %xmm0
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1		; X32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; X32-NEXT: vmovdqa %ymm0, (%eax)
; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper		; X32-NEXT: vzeroupper
; X32-NEXT: retl		; X32-NEXT: retl
;		;
; X64-LABEL: legal_vzmovl_2i64_4i64:		; X64-LABEL: legal_vzmovl_2i64_4i64:
; X64: # %bb.0:		; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %xmm0		; X64-NEXT: vmovdqu (%rdi), %xmm0
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1		; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; X64-NEXT: vmovdqa %ymm0, (%rsi)
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper		; X64-NEXT: vzeroupper
; X64-NEXT: retq		; X64-NEXT: retq
%ld = load <2 x i64>, <2 x i64>* %in, align 8		%ld = load <2 x i64>, <2 x i64>* %in, align 8
%ext = extractelement <2 x i64> %ld, i64 0		%ext = extractelement <2 x i64> %ld, i64 0
%ins = insertelement <4 x i64> <i64 undef, i64 0, i64 0, i64 0>, i64 %ext, i64 0		%ins = insertelement <4 x i64> <i64 undef, i64 0, i64 0, i64 0>, i64 %ext, i64 0
store <4 x i64> %ins, <4 x i64>* %out, align 32		store <4 x i64> %ins, <4 x i64>* %out, align 32
ret void		ret void
}		}
Show All 25 Lines	; X64-NEXT: retq
ret void		ret void
}		}

define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {		define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
; X32-LABEL: legal_vzmovl_2f64_4f64:		; X32-LABEL: legal_vzmovl_2f64_4f64:
; X32: # %bb.0:		; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx		; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups (%ecx), %xmm0		; X32-NEXT: vmovdqu (%ecx), %xmm0
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1		; X32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; X32-NEXT: vmovdqa %ymm0, (%eax)
; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper		; X32-NEXT: vzeroupper
; X32-NEXT: retl		; X32-NEXT: retl
;		;
; X64-LABEL: legal_vzmovl_2f64_4f64:		; X64-LABEL: legal_vzmovl_2f64_4f64:
; X64: # %bb.0:		; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %xmm0		; X64-NEXT: vmovdqu (%rdi), %xmm0
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1		; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; X64-NEXT: vmovdqa %ymm0, (%rsi)
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper		; X64-NEXT: vzeroupper
; X64-NEXT: retq		; X64-NEXT: retq
%ld = load <2 x double>, <2 x double>* %in, align 8		%ld = load <2 x double>, <2 x double>* %in, align 8
%ext = extractelement <2 x double> %ld, i64 0		%ext = extractelement <2 x double> %ld, i64 0
%ins = insertelement <4 x double> <double undef, double 0.0, double 0.0, double 0.0>, double %ext, i64 0		%ins = insertelement <4 x double> <double undef, double 0.0, double 0.0, double 0.0>, double %ext, i64 0
store <4 x double> %ins, <4 x double>* %out, align 32		store <4 x double> %ins, <4 x double>* %out, align 32
ret void		ret void
}		}

llvm/test/CodeGen/X86/vector-extend-inreg.ll

	Show First 20 Lines • Show All 65 Lines • ▼ Show 20 Lines
	;			;
	; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:			; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
	; X32-AVX: # %bb.0:			; X32-AVX: # %bb.0:
	; X32-AVX-NEXT: pushl %ebp			; X32-AVX-NEXT: pushl %ebp
	; X32-AVX-NEXT: movl %esp, %ebp			; X32-AVX-NEXT: movl %esp, %ebp
	; X32-AVX-NEXT: andl $-128, %esp			; X32-AVX-NEXT: andl $-128, %esp
	; X32-AVX-NEXT: subl $384, %esp # imm = 0x180			; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
	; X32-AVX-NEXT: movl 40(%ebp), %ecx			; X32-AVX-NEXT: movl 40(%ebp), %ecx
	; X32-AVX-NEXT: vbroadcastsd 32(%ebp), %ymm0			; X32-AVX-NEXT: vpbroadcastq 32(%ebp), %ymm0
	; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1			; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
	; X32-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
	; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1			; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
	; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: vmovaps %ymm1, (%esp)			; X32-AVX-NEXT: vmovaps %ymm1, (%esp)
	; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)			; X32-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp)
	; X32-AVX-NEXT: leal (%ecx,%ecx), %eax			; X32-AVX-NEXT: leal (%ecx,%ecx), %eax
	; X32-AVX-NEXT: andl $31, %eax			; X32-AVX-NEXT: andl $31, %eax
	; X32-AVX-NEXT: movl 128(%esp,%eax,4), %eax			; X32-AVX-NEXT: movl 128(%esp,%eax,4), %eax
	; X32-AVX-NEXT: leal 1(%ecx,%ecx), %ecx			; X32-AVX-NEXT: leal 1(%ecx,%ecx), %ecx
	; X32-AVX-NEXT: andl $31, %ecx			; X32-AVX-NEXT: andl $31, %ecx
	; X32-AVX-NEXT: movl (%esp,%ecx,4), %edx			; X32-AVX-NEXT: movl (%esp,%ecx,4), %edx
	; X32-AVX-NEXT: movl %ebp, %esp			; X32-AVX-NEXT: movl %ebp, %esp
	; X32-AVX-NEXT: popl %ebp			; X32-AVX-NEXT: popl %ebp
	; X32-AVX-NEXT: vzeroupper			; X32-AVX-NEXT: vzeroupper
	; X32-AVX-NEXT: retl			; X32-AVX-NEXT: retl
	;			;
	; X64-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:			; X64-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
	; X64-AVX: # %bb.0:			; X64-AVX: # %bb.0:
	; X64-AVX-NEXT: pushq %rbp			; X64-AVX-NEXT: pushq %rbp
	; X64-AVX-NEXT: movq %rsp, %rbp			; X64-AVX-NEXT: movq %rsp, %rbp
	; X64-AVX-NEXT: andq $-128, %rsp			; X64-AVX-NEXT: andq $-128, %rsp
	; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100			; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
	; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi			; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi
	; X64-AVX-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]			; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
	; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1			; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
	; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
	; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1			; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
	; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)			; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
	; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)			; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
	; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)			; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)
	; X64-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)			; X64-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
	; X64-AVX-NEXT: andl $15, %edi			; X64-AVX-NEXT: andl $15, %edi
	; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax			; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax
	; X64-AVX-NEXT: movq %rbp, %rsp			; X64-AVX-NEXT: movq %rbp, %rsp
	; X64-AVX-NEXT: popq %rbp			; X64-AVX-NEXT: popq %rbp
	; X64-AVX-NEXT: vzeroupper			; X64-AVX-NEXT: vzeroupper
	; X64-AVX-NEXT: retq			; X64-AVX-NEXT: retq
	%1 = extractelement <16 x i64> %a0, i32 15			%1 = extractelement <16 x i64> %a0, i32 15
	%2 = insertelement <16 x i64> zeroinitializer, i64 %1, i32 4			%2 = insertelement <16 x i64> zeroinitializer, i64 %1, i32 4
	%3 = extractelement <16 x i64> %2, i32 %a1			%3 = extractelement <16 x i64> %2, i32 %a1
	ret i64 %3			ret i64 %3
	}			}

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

Show First 20 Lines • Show All 1,499 Lines • ▼ Show 20 Lines	; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>		%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x i64> %shuffle		ret <4 x i64> %shuffle
}		}

define <4 x double> @insert_reg_and_zero_v4f64(double %a) {		define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; ALL-LABEL: insert_reg_and_zero_v4f64:		; ALL-LABEL: insert_reg_and_zero_v4f64:
; ALL: # %bb.0:		; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0		; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1		; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; ALL-NEXT: retq		; ALL-NEXT: retq
%v = insertelement <4 x double> undef, double %a, i32 0		%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>		%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x double> %shuffle		ret <4 x double> %shuffle
}		}

define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {		define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4f64:		; ALL-LABEL: insert_mem_and_zero_v4f64:
▲ Show 20 Lines • Show All 464 Lines • ▼ Show 20 Lines	entry:
%shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>		%shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
%add = add <4 x i64> %shuffle, %shuffle1		%add = add <4 x i64> %shuffle, %shuffle1
ret <4 x i64> %add		ret <4 x i64> %add
}		}

define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {		define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_0zzz_optsize:		; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
; ALL: # %bb.0:		; ALL: # %bb.0:
; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1		; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; ALL-NEXT: retq		; ALL-NEXT: retq
%b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>		%b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x double> %b		ret <4 x double> %b
}		}

define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {		define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
; ALL-LABEL: shuffle_v4i64_0zzz_optsize:		; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
; ALL: # %bb.0:		; ALL: # %bb.0:
; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1		; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; ALL-NEXT: retq		; ALL-NEXT: retq
%b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>		%b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x i64> %b		ret <4 x i64> %b
}		}

define <8 x float> @shuffle_v8f32_0zzzzzzz_optsize(<8 x float> %a) optsize {		define <8 x float> @shuffle_v8f32_0zzzzzzz_optsize(<8 x float> %a) optsize {
; ALL-LABEL: shuffle_v8f32_0zzzzzzz_optsize:		; ALL-LABEL: shuffle_v8f32_0zzzzzzz_optsize:
; ALL: # %bb.0:		; ALL: # %bb.0:
Show All 37 Lines

llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll

	Show First 20 Lines • Show All 1,967 Lines • ▼ Show 20 Lines
	; ALL-NEXT: ret{{[l\|q]}}			; ALL-NEXT: ret{{[l\|q]}}
	%1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 0, i32 1>			%1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 0, i32 1>
	ret <8 x double> %1			ret <8 x double> %1
	}			}

	define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {			define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
	; ALL-LABEL: shuffle_v8i64_0zzzzzzz:			; ALL-LABEL: shuffle_v8i64_0zzzzzzz:
	; ALL: # %bb.0:			; ALL: # %bb.0:
	; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1			; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
	; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
	; ALL-NEXT: ret{{[l\|q]}}			; ALL-NEXT: ret{{[l\|q]}}
	%shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>			%shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
	ret <8 x i64> %shuffle			ret <8 x i64> %shuffle
	}			}

	define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {			define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
	; ALL-LABEL: shuffle_v8f64_0zzzzzzz:			; ALL-LABEL: shuffle_v8f64_0zzzzzzz:
	; ALL: # %bb.0:			; ALL: # %bb.0:
	; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1			; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
	; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
	; ALL-NEXT: ret{{[l\|q]}}			; ALL-NEXT: ret{{[l\|q]}}
	%shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>			%shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
	ret <8 x double> %shuffle			ret <8 x double> %shuffle
	}			}

	define <8 x i64> @shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b) {			define <8 x i64> @shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b) {
	;			;
	; ALL-LABEL: shuffle_v8i64_12345678:			; ALL-LABEL: shuffle_v8i64_12345678:
	▲ Show 20 Lines • Show All 329 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

Show First 20 Lines • Show All 374 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret{{[l\|q]}}
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)		%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%3 = bitcast <32 x i8> %2 to <4 x i64>		%3 = bitcast <32 x i8> %2 to <4 x i64>
ret <4 x i64> %3		ret <4 x i64> %3
}		}

define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {		define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_64:		; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
; CHECK: # %bb.0:		; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1		; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; CHECK-NEXT: ret{{[l\|q]}}		; CHECK-NEXT: ret{{[l\|q]}}
%1 = bitcast <4 x double> %a0 to <32 x i8>		%1 = bitcast <4 x double> %a0 to <32 x i8>
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)		%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%3 = bitcast <32 x i8> %2 to <4 x double>		%3 = bitcast <32 x i8> %2 to <4 x double>
ret <4 x double> %3		ret <4 x double> %3
}		}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {		define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
▲ Show 20 Lines • Show All 406 Lines • Show Last 20 Lines