This is an archive of the discontinued LLVM Phabricator instance.

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 8aee96e1c504..4dca5490b26f 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4659,6 +4659,8 @@ let Predicates = [HasAVX512] in {
 let Predicates = [HasFP16] in {
   def : Pat<(v8f16 (X86vzmovl (v8f16 VR128X:$src))),
             (VMOVSHZrr (v8f16 (AVX512_128_SET0)), VR128X:$src)>;
+  def : Pat<(v8i16 (X86vzmovl (v8i16 VR128X:$src))),
+            (VMOVSHZrr (v8i16 (AVX512_128_SET0)), VR128X:$src)>;

   // FIXME we need better canonicalization in dag combine
   def : Pat<(v16f16 (X86vzmovl (v16f16 VR256X:$src))),

Harbormaster completed remote builds in B133690: Diff 386472. · Nov 11 2021, 5:01 AM

Enable another use case.

In D113661#3124238, @LuoYuanke wrote:

Should we support VZEXT_MOVL for i16 with VMOVSHZrr, just as we support VZEXT_MOVL for i32?

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 8aee96e1c504..4dca5490b26f 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4659,6 +4659,8 @@ let Predicates = [HasAVX512] in {
 let Predicates = [HasFP16] in {
   def : Pat<(v8f16 (X86vzmovl (v8f16 VR128X:$src))),
             (VMOVSHZrr (v8f16 (AVX512_128_SET0)), VR128X:$src)>;
+  def : Pat<(v8i16 (X86vzmovl (v8i16 VR128X:$src))),
+            (VMOVSHZrr (v8i16 (AVX512_128_SET0)), VR128X:$src)>;

   // FIXME we need better canonicalization in dag combine
   def : Pat<(v16f16 (X86vzmovl (v16f16 VR256X:$src))),

I think it's OK for now, since we only have these 2 cases for i16 that may create a VZEXT_MOVL node. If we add the pattern, we would also need to duplicate it for v16i16 and v32i16.

LGTM, thanks.

Harbormaster completed remote builds in B133707: Diff 386492. · Nov 11 2021, 6:45 AM

Closed by commit rG74b979abcd0f: [X86][FP16] Avoid to generate VZEXT_MOVL with i16 (authored by pengfei). · Explain Why · Nov 11 2021, 5:32 PM

This revision was automatically updated to reflect the committed changes.

pengfei added a commit: rG74b979abcd0f: [X86][FP16] Avoid to generate VZEXT_MOVL with i16.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

12 lines

test/

CodeGen/

X86/

avx512fp16-mov.ll

29 lines

Diff 386698

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 36,094 Lines • ▼ Show 20 Lines	static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,

// Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.		// Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
if (Mask[0] == 0 &&		if (Mask[0] == 0 &&
(MaskEltSize == 32 \|\| (MaskEltSize == 16 && Subtarget.hasFP16()))) {		(MaskEltSize == 32 \|\| (MaskEltSize == 16 && Subtarget.hasFP16()))) {
if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) \|\|		if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) \|\|
(V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&		(V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {		isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
Shuffle = X86ISD::VZEXT_MOVL;		Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT =		SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16
!Subtarget.hasSSE2() && MaskEltSize == 32 ? MVT::v4f32 : MaskVT;		: !Subtarget.hasSSE2() ? MVT::v4f32
		: MaskVT;
return true;		return true;
}		}
}		}

// Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.		// Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).		// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) \|\|		if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) \|\|
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {		(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
Show All 27 Lines	for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);		DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);		DstVT = MVT::getVectorVT(DstVT, NumDstElts);
return true;		return true;
}		}
}		}
}		}

// Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).		// Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
if (((MaskEltSize == 32) \|\| (MaskEltSize == 64 && Subtarget.hasSSE2())) &&		if (((MaskEltSize == 32) \|\| (MaskEltSize == 64 && Subtarget.hasSSE2()) \|\|
		(MaskEltSize == 16 && Subtarget.hasFP16())) &&
isUndefOrEqual(Mask[0], 0) &&		isUndefOrEqual(Mask[0], 0) &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {		isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;		Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;		SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16
		: !Subtarget.hasSSE2() ? MVT::v4f32
		: MaskVT;
return true;		return true;
}		}

// Check if we have SSE3 which will let us use MOVDDUP etc. The		// Check if we have SSE3 which will let us use MOVDDUP etc. The
// instructions are no slower than UNPCKLPD but has the option to		// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.		// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {		if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {		if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
▲ Show 20 Lines • Show All 18,092 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/avx512fp16-mov.ll

	Show First 20 Lines • Show All 1,920 Lines • ▼ Show 20 Lines
	; X86-NEXT: vmovlps %xmm0, (%eax)			; X86-NEXT: vmovlps %xmm0, (%eax)
	; X86-NEXT: retl			; X86-NEXT: retl
	%a = load <4 x half>, <4 x half>* %x			%a = load <4 x half>, <4 x half>* %x
	%b = load <4 x half>, <4 x half>* %y			%b = load <4 x half>, <4 x half>* %y
	%c = fadd <4 x half> %a, %b			%c = fadd <4 x half> %a, %b
	store <4 x half> %c, <4 x half>* %z			store <4 x half> %c, <4 x half>* %z
	ret void			ret void
	}			}

				define <8 x half> @test21(half %a, half %b, half %c) nounwind {
				; X64-LABEL: test21:
				; X64: # %bb.0:
				; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
				; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2
				; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
				; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
				; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; X64-NEXT: vpbroadcastw %xmm1, %xmm1
				; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
				; X64-NEXT: retq
				;
				; X86-LABEL: test21:
				; X86: # %bb.0:
				; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
				; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
				; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
				; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
				; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
				; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; X86-NEXT: vpbroadcastw %xmm1, %xmm1
				; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
				; X86-NEXT: retl
				%1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0
				%2 = insertelement <8 x half> %1, half %b, i32 1
				%3 = insertelement <8 x half> %2, half %c, i32 2
				ret <8 x half> %3
				}

This is an archive of the discontinued LLVM Phabricator instance.

[X86][FP16] Avoid to generate VZEXT_MOVL with i16 · Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 386698

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/avx512fp16-mov.ll

[X86][FP16] Avoid to generate VZEXT_MOVL with i16
Closed · Public