Diff 23170

llvm/trunk/lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 7,162 Lines • ▼ Show 20 Lines

	let Predicates = [HasAVX2] in {			let Predicates = [HasAVX2] in {
	def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),			def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
	(v32i8 VR256:$src2))),			(v32i8 VR256:$src2))),
	(VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;			(VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
	}			}

	// Patterns			// Patterns
				// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
				// on targets where they have equal performance. These were changed to use
				// blends because blends have better throughput on SandyBridge and Haswell, but
				// movs[s/d] are 1-2 byte shorter instructions.
	let Predicates = [UseAVX] in {			let Predicates = [UseAVX] in {
	let AddedComplexity = 15 in {			let AddedComplexity = 15 in {
	// Move scalar to XMM zero-extended, zeroing a VR128 then do a			// Move scalar to XMM zero-extended, zeroing a VR128 then do a
	// MOVS{S,D} to the lower bits.			// MOVS{S,D} to the lower bits.
	def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),			def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
	(VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;			(VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
	def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),			def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
	(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;			(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
	def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),			def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
	(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;			(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
	def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),			def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
	(VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;			(VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

	// Move low f32 and clear high bits.			// Move low f32 and clear high bits.
	def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),			def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
	(VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;			(VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
	def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
	(VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;			// Move low f64 and clear high bits.
				def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
				(VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
	}			}

	def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,			def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
	(v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),			(v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
	(SUBREG_TO_REG (i32 0),			(SUBREG_TO_REG (i32 0),
	(v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),			(v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
	sub_xmm)>;			sub_xmm)>;
	def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,			def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
	(v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),			(v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
	(SUBREG_TO_REG (i64 0),			(SUBREG_TO_REG (i64 0),
	(v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),			(v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
	sub_xmm)>;			sub_xmm)>;

	// Move low f64 and clear high bits.			// These will incur an FP/int domain crossing penalty, but it may be the only
	def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),			// way without AVX2. Do not add any complexity because we may be able to match
	(VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;			// more optimal patterns defined earlier in this file.
				def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
				(VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
	def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),			def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
	(VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;			(VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
	}			}

				// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
				// on targets where they have equal performance. These were changed to use
				// blends because blends have better throughput on SandyBridge and Haswell, but
				// movs[s/d] are 1-2 byte shorter instructions.
	let Predicates = [UseSSE41] in {			let Predicates = [UseSSE41] in {
	// With SSE41 we can use blends for these patterns.			// With SSE41 we can use blends for these patterns.
	def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),			def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
	(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;			(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
	def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),			def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
	(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;			(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
	def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),			def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
	(BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;			(BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
	▲ Show 20 Lines • Show All 1,661 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll

Show First 20 Lines • Show All 837 Lines • ▼ Show 20 Lines	; AVX2-NEXT: retq
%v = insertelement <4 x i64> undef, i64 %a, i64 0		%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>		%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x i64> %shuffle		ret <4 x i64> %shuffle
}		}

define <4 x double> @insert_reg_and_zero_v4f64(double %a) {		define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; ALL-LABEL: insert_reg_and_zero_v4f64:		; ALL-LABEL: insert_reg_and_zero_v4f64:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1		; ALL-NEXT: # kill: XMM0<def> XMM0<kill> YMM0<def>
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]		; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
		; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; ALL-NEXT: retq		; ALL-NEXT: retq
%v = insertelement <4 x double> undef, double %a, i32 0		%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>		%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x double> %shuffle		ret <4 x double> %shuffle
}		}

define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {		define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4f64:		; ALL-LABEL: insert_mem_and_zero_v4f64:
▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll

	Show First 20 Lines • Show All 127 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]			; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]			; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8f32_70000000:			; AVX2-LABEL: shuffle_v8f32_70000000:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: movl $7, %eax			; AVX2-NEXT: movl $7, %eax
	; AVX2-NEXT: vmovd %eax, %xmm1			; AVX2-NEXT: vmovd %eax, %xmm1
	; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
	; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
	; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

	define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {			define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
	; ALL-LABEL: shuffle_v8f32_01014545:			; ALL-LABEL: shuffle_v8f32_01014545:
	▲ Show 20 Lines • Show All 811 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]			; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]			; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8i32_70000000:			; AVX2-LABEL: shuffle_v8i32_70000000:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: movl $7, %eax			; AVX2-NEXT: movl $7, %eax
	; AVX2-NEXT: vmovd %eax, %xmm1			; AVX2-NEXT: vmovd %eax, %xmm1
	; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
	; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
	; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <8 x i32> %shuffle			ret <8 x i32> %shuffle
	}			}

	define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {			define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
	; AVX1-LABEL: shuffle_v8i32_01014545:			; AVX1-LABEL: shuffle_v8i32_01014545:
	▲ Show 20 Lines • Show All 1,135 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86, AVX] adjust tablegen patterns to generate better code for scalar insertion into zero vector (PR23073)
Closed • Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 23170

llvm/trunk/lib/Target/X86/X86InstrSSE.td

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86, AVX] adjust tablegen patterns to generate better code for scalar insertion into zero vector (PR23073)
Closed • Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 23170

llvm/trunk/lib/Target/X86/X86InstrSSE.td

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll

[X86, AVX] adjust tablegen patterns to generate better code for scalar insertion into zero vector (PR23073)
Closed • Public