This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX2] Enable ZERO_EXTEND_VECTOR_INREG lowering of 256-bit vectors
ClosedPublic

Authored by RKSimon on Oct 7 2018, 6:54 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel

Commits

rG6fc8d055656b: [X86][AVX2] Enable ZERO_EXTEND_VECTOR_INREG lowering of 256-bit vectors
rL343991: [X86][AVX2] Enable ZERO_EXTEND_VECTOR_INREG lowering of 256-bit vectors

Summary

Some necessary yak shaving before lowering *_EXTEND_VECTOR_INREG 256-bit vectors on AVX1 targets as suggested by D52964.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Oct 7 2018, 6:54 AM

RKSimon added inline comments.

test/CodeGen/X86/pr35443.ll
20	@craig.topper Please can you confirm if the pr35443.ll change is acceptable? An alternative is to set the passthrough value to zeroinitializer, which instead adds a vpmovzxbq op after the vmovd (some kind of demanded bits failure that could be fixed in a future patch).

RKSimon mentioned this in D52980: [X86][AVX1] Enable *_EXTEND_VECTOR_INREG lowering of 256-bit vectors. Oct 8 2018, 4:12 AM

RKSimon added a child revision: D52980: [X86][AVX1] Enable *_EXTEND_VECTOR_INREG lowering of 256-bit vectors. Oct 8 2018, 4:15 AM

craig.topper added inline comments. Oct 8 2018, 10:34 AM

test/CodeGen/X86/pr35443.ll
20	What if you just change the alignment of @ac to 1? That should prevent the single-byte load from the masked.load from being promoted to a wider size, I think.

Tweaked load alignment of test - the additional vpmovzxbq /should/ be removable with a suitable demandedelts+demandedbits combine (probably D52935 in reverse).

LGTM

This revision is now accepted and ready to land. Oct 8 2018, 11:26 AM

Closed by commit rL343991: [X86][AVX2] Enable ZERO_EXTEND_VECTOR_INREG lowering of 256-bit vectors (authored by RKSimon). · Explain Why · Oct 8 2018, 11:42 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 343983)

12 lines

test/

CodeGen/

X86/

	avg.ll
	avg.ll (revision 343983)

101 lines

	pr35443.ll
	pr35443.ll (revision 343983)

3 lines

	vector-zext.ll
	vector-zext.ll (revision 343983)

14 lines

Diff 168688

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,119 Lines • ▼ Show 20 Lines	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);		setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);		setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);		setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);		setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);		setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}		}

if (HasInt256) {		if (HasInt256) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);		for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);		setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);		setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
		}

// The custom lowering for UINT_TO_FP for v8i32 becomes interesting		// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.		// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);		setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X		// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {		for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);		setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
▲ Show 20 Lines • Show All 18,583 Lines • ▼ Show 20 Lines	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
// For 512-bit vectors, we need 128-bits or 256-bits.		// For 512-bit vectors, we need 128-bits or 256-bits.
if (VT.getSizeInBits() > 128) {		if (VT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and		// Input needs to be at least the same number of elements as output, and
// at least 128-bits.		// at least 128-bits.
int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();		int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));		In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
}		}

assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG \|\|		// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

// SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still		// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
// need to be handled here for 256/512-bit results.		// need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {		if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");		assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?		unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
X86ISD::VSEXT : X86ISD::VZEXT;		X86ISD::VSEXT : X86ISD::VZEXT;
return DAG.getNode(ExtOpc, dl, VT, In);		return DAG.getNode(ExtOpc, dl, VT, In);
}		}
▲ Show 20 Lines • Show All 21,819 Lines • Show Last 20 Lines

test/CodeGen/X86/avg.ll

	Show First 20 Lines • Show All 375 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
	; AVX1-NEXT: vmovdqu %xmm0, (%rax)			; AVX1-NEXT: vmovdqu %xmm0, (%rax)
	; AVX1-NEXT: vmovups %ymm1, (%rax)			; AVX1-NEXT: vmovups %ymm1, (%rax)
	; AVX1-NEXT: vzeroupper			; AVX1-NEXT: vzeroupper
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: avg_v48i8:			; AVX2-LABEL: avg_v48i8:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vmovdqa (%rdi), %ymm1			; AVX2-NEXT: vmovdqa (%rdi), %ymm0
	; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2			; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
	; AVX2-NEXT: vmovdqa (%rsi), %ymm3			; AVX2-NEXT: vmovdqa (%rsi), %ymm2
	; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0			; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4			; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
	; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
	; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero			; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,2,3]
	; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
	; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5			; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-NEXT: vpand %ymm9, %ymm5, %ymm5
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
				; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
				; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero			; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero			; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,2,3]
				; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
				; AVX2-NEXT: vpaddd %ymm4, %ymm5, %ymm4
				; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
				; AVX2-NEXT: vpaddd %ymm5, %ymm6, %ymm5
	; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6			; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
	; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero			; AVX2-NEXT: vpaddd %ymm2, %ymm7, %ymm2
	; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
	; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; AVX2-NEXT: vpand %ymm9, %ymm2, %ymm2
	; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
	; AVX2-NEXT: vpaddd %ymm4, %ymm7, %ymm4
	; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]			; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
	; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1			; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
	; AVX2-NEXT: vpaddd %ymm3, %ymm11, %ymm3
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
	; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0
	; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6			; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
	; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
	; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
	; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1			; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
	; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3			; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
	; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5			; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
	; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0			; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm7
	; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2			; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
				; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm0
	; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0			; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
				; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
				; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3
	; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5			; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
	; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
	; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
	; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4			; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
	; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6			; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
	; AVX2-NEXT: vpackusdw %xmm6, %xmm4, %xmm4			; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6
				; AVX2-NEXT: vpackusdw %xmm6, %xmm1, %xmm1
	; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>			; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
	; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
	; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
	; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1			; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]			; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
	; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4			; AVX2-NEXT: vpackusdw %xmm7, %xmm4, %xmm4
	; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2			; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
	; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2			; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4			; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4
	; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3			; AVX2-NEXT: vpackusdw %xmm4, %xmm5, %xmm4
				; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
				; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
				; AVX2-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
	; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3			; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]			; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1			; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
	; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2			; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
	; AVX2-NEXT: vpackusdw %xmm2, %xmm5, %xmm2			; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
	; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2			; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3			; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0			; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
	; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0			; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]			; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
	; AVX2-NEXT: vmovdqu %xmm0, (%rax)			; AVX2-NEXT: vmovdqu %xmm0, (%rax)
	; AVX2-NEXT: vmovdqu %ymm1, (%rax)			; AVX2-NEXT: vmovdqu %ymm1, (%rax)
	; AVX2-NEXT: vzeroupper			; AVX2-NEXT: vzeroupper
	▲ Show 20 Lines • Show All 2,466 Lines • Show Last 20 Lines

test/CodeGen/X86/pr35443.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx \| FileCheck %s			; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx \| FileCheck %s

	@ac = external local_unnamed_addr global [20 x i8], align 16			@ac = external local_unnamed_addr global [20 x i8], align 1
	@ai3 = external local_unnamed_addr global [20 x i32], align 16			@ai3 = external local_unnamed_addr global [20 x i32], align 16

	; Function Attrs: norecurse nounwind uwtable			; Function Attrs: norecurse nounwind uwtable
	define void @pr35443() {			define void @pr35443() {
	; CHECK-LABEL: pr35443:			; CHECK-LABEL: pr35443:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: movzbl ac+{{.*}}(%rip), %eax			; CHECK-NEXT: movzbl ac+{{.*}}(%rip), %eax
	; CHECK-NEXT: vmovd %eax, %xmm0			; CHECK-NEXT: vmovd %eax, %xmm0
				; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
	; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1			; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; CHECK-NEXT: vpsubq %ymm0, %ymm1, %ymm0			; CHECK-NEXT: vpsubq %ymm0, %ymm1, %ymm0
	; CHECK-NEXT: vpmovqd %ymm0, ai3+{{.*}}(%rip)			; CHECK-NEXT: vpmovqd %ymm0, ai3+{{.*}}(%rip)
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	entry:			entry:
	%wide.masked.load66 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([20 x i8], [20 x i8]* @ac, i64 0, i64 4) to <4 x i8>*), i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> undef)			%wide.masked.load66 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([20 x i8], [20 x i8]* @ac, i64 0, i64 4) to <4 x i8>*), i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> undef)
				RKSimon (Author) · Unsubmitted · Not Done · Reply · Inline Actions — @craig.topper Please can you confirm if the pr35443.ll change is acceptable? An alternative is to set the passthrough value to zeroinitializer, which instead adds a vpmovzxbq op after the vmovd (some kind of demanded bits failure that could be fixed in a future patch). RKSimon: @craig.topper Please can you confirm if the pr35443.ll change is acceptable? An alternative is…
				craig.topper · Unsubmitted · Not Done · Reply · Inline Actions — What if you just change the alignment of @ac to 1? That should prevent the single-byte load from the masked.load from being promoted to a wider size, I think. craig.topper: What if you just change the alignment of @ac to 1? That should prevent the single byte load…
	%0 = zext <4 x i8> %wide.masked.load66 to <4 x i64>			%0 = zext <4 x i8> %wide.masked.load66 to <4 x i64>
	%1 = sub <4 x i64> zeroinitializer, %0			%1 = sub <4 x i64> zeroinitializer, %0
	%predphi = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 6, i32 7>			%predphi = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
	%2 = trunc <4 x i64> %predphi to <4 x i32>			%2 = trunc <4 x i64> %predphi to <4 x i32>
	%3 = add <4 x i32> zeroinitializer, %2			%3 = add <4 x i32> zeroinitializer, %2
	store <4 x i32> %3, <4 x i32>* bitcast (i32* getelementptr inbounds ([20 x i32], [20 x i32]* @ai3, i64 0, i64 4) to <4 x i32>*), align 16			store <4 x i32> %3, <4 x i32>* bitcast (i32* getelementptr inbounds ([20 x i32], [20 x i32]* @ai3, i64 0, i64 4) to <4 x i32>*), align 16
	ret void			ret void
	}			}

	; Function Attrs: argmemonly nounwind readonly			; Function Attrs: argmemonly nounwind readonly
	declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)			declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)

test/CodeGen/X86/vector-zext.ll

	Show First 20 Lines • Show All 2,232 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]			; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
	; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero			; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
	; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3			; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
	; AVX1-NEXT: vmovaps %ymm4, %ymm0			; AVX1-NEXT: vmovaps %ymm4, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: zext_32i8_to_32i32:			; AVX2-LABEL: zext_32i8_to_32i32:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
	; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
	; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
	; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
	; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm3
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
				; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
	; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero			; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
				; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
				; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
	; AVX2-NEXT: vmovdqa %ymm4, %ymm0			; AVX2-NEXT: vmovdqa %ymm4, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: zext_32i8_to_32i32:			; AVX512-LABEL: zext_32i8_to_32i32:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero			; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0			; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
	; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero			; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
	▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines