This is an archive of the discontinued LLVM Phabricator instance.

[X86] Improve codegen of v8i64->v8i16 and v16i32->v16i8 truncate with avx512vl, avx512bw, min-legal-vector-width<=256 and prefer-vector-width=256
ClosedPublic

Authored by craig.topper on Aug 1 2019, 6:15 PM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel

Commits

rGc49d3e6c4d3b: [X86] Improve codegen of v8i64->v8i16 and v16i32->v16i8 truncate with avx512vl…
rL368349: [X86] Improve codegen of v8i64->v8i16 and v16i32->v16i8 truncate with avx512vl…

Summary

Under this configuration we'll want to split the v8i64 or v16i32 into two vectors. The default legalization will try to truncate each of those 256-bit pieces one step to 128-bit, concatenate those, then truncate one more time from the new 256 to 128 bits.

With this patch we now truncate the two splits to 64 bits and then concatenate those. We have to do this in two different ways depending on whether we have widening legalization enabled. Without widening legalization we have to manually construct X86ISD::VTRUNC; otherwise the ISD::TRUNCATE with a narrow result would be promoted to 128 bits with a larger element type than we want, followed by something like a pshufb to grab the lower half of each element to finish the job. With widening legalization we just get the right thing. When we switch to widening by default we can just delete the other code path.

Diff Detail

Event Timeline

craig.topper created this revision.Aug 1 2019, 6:15 PM

Herald added a project: Restricted Project. · View Herald TranscriptAug 1 2019, 6:15 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

Remove the special case for vector widening legalization

LGTM

This revision is now accepted and ready to land.Aug 8 2019, 1:09 PM

Closed by commit rL368349: [X86] Improve codegen of v8i64->v8i16 and v16i32->v16i8 truncate with avx512vl… (authored by ctopper). · Explain WhyAug 8 2019, 2:35 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

23 lines

test/

CodeGen/

X86/

min-legal-vector-width.ll

14 lines

Diff 213958

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,771 Lines • ▼ Show 20 Lines	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
if (Subtarget.hasVBMI2()) {		if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?		// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,		for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {		MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);		setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);		setOperationAction(ISD::FSHR, VT, Custom);
}		}
}		}

		setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
		setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
}		}

// We want to custom lower some of our intrinsics.		// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);		setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);		setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {		if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);		setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
▲ Show 20 Lines • Show All 17,255 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue In = Op.getOperand(0);		SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();		MVT InVT = In.getSimpleValueType();
unsigned InNumEltBits = InVT.getScalarSizeInBits();		unsigned InNumEltBits = InVT.getScalarSizeInBits();

assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&		assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");		"Invalid TRUNCATE operation");

// If called by the legalizer just return.		// If called by the legalizer just return.
if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))		if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) {
		if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) {
		assert(Subtarget.hasVLX() && "Unexpected subtarget!");
		// The default behavior is to truncate one step, concatenate, and then
		// truncate the remainder. We'd rather produce two 64-bit results and
		// concatenate those.
		SDValue Lo, Hi;
		std::tie(Lo, Hi) = DAG.SplitVector(In, DL);

		EVT LoVT, HiVT;
		std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

		Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
		Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
		return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
		}

		// Otherwise let default legalization handle it.
return SDValue();		return SDValue();
		}

if (VT.getVectorElementType() == MVT::i1)		if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);		return LowerTruncateVecI1(Op, DAG, Subtarget);

// vpmovqb/w/d, vpmovdb/w, vpmovwb		// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {		if (Subtarget.hasAVX512()) {
// word to byte only under BWI. Otherwise we have to promoted to v16i32		// word to byte only under BWI. Otherwise we have to promoted to v16i32
// and then truncate that. But we should only do that if we haven't been		// and then truncate that. But we should only do that if we haven't been
▲ Show 20 Lines • Show All 26,737 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/min-legal-vector-width.ll

	Show First 20 Lines • Show All 719 Lines • ▼ Show 20 Lines
	}			}
	declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)			declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

	define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {			define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
	; CHECK-LABEL: trunc_v16i32_v16i8:			; CHECK-LABEL: trunc_v16i32_v16i8:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm0			; CHECK-NEXT: vmovdqa (%rdi), %ymm0
	; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1			; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
	; CHECK-NEXT: vpmovdw %ymm0, %xmm0			; CHECK-NEXT: vpmovdb %ymm1, %xmm1
	; CHECK-NEXT: vpmovdw %ymm1, %xmm1			; CHECK-NEXT: vpmovdb %ymm0, %xmm0
	; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0			; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; CHECK-NEXT: vpmovwb %ymm0, %xmm0
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%a = load <16 x i32>, <16 x i32>* %x			%a = load <16 x i32>, <16 x i32>* %x
	%b = trunc <16 x i32> %a to <16 x i8>			%b = trunc <16 x i32> %a to <16 x i8>
	ret <16 x i8> %b			ret <16 x i8> %b
	}			}

	define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {			define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
	; CHECK-LABEL: trunc_v8i64_v8i16:			; CHECK-LABEL: trunc_v8i64_v8i16:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: vmovdqa (%rdi), %ymm0			; CHECK-NEXT: vmovdqa (%rdi), %ymm0
	; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1			; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
	; CHECK-NEXT: vpmovqd %ymm0, %xmm0			; CHECK-NEXT: vpmovqw %ymm1, %xmm1
	; CHECK-NEXT: vpmovqd %ymm1, %xmm1			; CHECK-NEXT: vpmovqw %ymm0, %xmm0
	; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0			; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; CHECK-NEXT: vpmovdw %ymm0, %xmm0
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%a = load <8 x i64>, <8 x i64>* %x			%a = load <8 x i64>, <8 x i64>* %x
	%b = trunc <8 x i64> %a to <8 x i16>			%b = trunc <8 x i64> %a to <8 x i16>
	ret <8 x i16> %b			ret <8 x i16> %b
	}			}