This is an archive of the discontinued LLVM Phabricator instance.

[X86] Add v32i8 shuffle lowering strategy to recognize two v4i64 vectors truncated to v4i8 and concatenated into the lower 8 bytes with undef/zero upper bytes.
ClosedPublic

Authored by craig.topper on Oct 2 2019, 11:18 PM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel

Commits

rG185ee6ec7cb3: [X86] Add v32i8 shuffle lowering strategy to recognize two v4i64 vectors…

Summary

This patch recognizes the shuffle pattern we get from a
v8i64->v8i8 truncate when v8i64 isn't a legal type.

With VLX we can use two VTRUNCs, an unpckldq, and an insert_subvector.

Diff Detail

Event Timeline

craig.topper created this revision.Oct 2 2019, 11:18 PM

Herald added a project: Restricted Project. · View Herald TranscriptOct 2 2019, 11:18 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

Harbormaster completed remote builds in B38933: Diff 222969.Oct 2 2019, 11:18 PM

RKSimon added inline comments.Oct 3 2019, 4:00 AM

llvm/lib/Target/X86/X86ISelLowering.cpp
15539	isSequentialOrUndefInRange(Mask, 0, 8, 0, 8) ?
15542	Zeroable.extractBits(16, 8).isAllOnesValue() ?

craig.topper marked an inline comment as done.Oct 3 2019, 9:36 AM

craig.topper added inline comments.

llvm/lib/Target/X86/X86ISelLowering.cpp
15542	That should have been 8-32. I guess I had 24 bits in my head and wrote the wrong end.

Use simpler checks instead of loops.

craig.topper marked an inline comment as done.Oct 3 2019, 10:14 AM

craig.topper added inline comments.

llvm/lib/Target/X86/X86ISelLowering.cpp
15538	I went with an approach that relied less on having two magic numbers mentioned. I wrote it in terms of Mask.size() even though we know that's 32, so that only the 8 that was already used above is mentioned again.

LGTM - cheers

This revision is now accepted and ready to land.Oct 3 2019, 10:52 AM

craig.topper mentioned this in D68428: [X86] Add custom type legalization for v16i64->v16i8 truncate and v8i64->v8i8 truncate when v8i64 isn't legal.Oct 3 2019, 3:01 PM

r373645

craig.topper added a commit: rG185ee6ec7cb3: [X86] Add v32i8 shuffle lowering strategy to recognize two v4i64 vectors….Oct 4 2019, 10:18 AM

Diffusion mentioned this in rL373864: [X86] Add custom type legalization for v16i64->v16i8 truncate and v8i64->v8i8….Oct 6 2019, 11:41 AM

craig.topper mentioned this in rG570ae49d030c: [X86] Add custom type legalization for v16i64->v16i8 truncate and v8i64->v8i8….Oct 6 2019, 11:42 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

44 lines

test/

CodeGen/

X86/

min-legal-vector-width.ll

19 lines

shuffle-vs-trunc-512.ll

38 lines

Diff 223050

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 15,514 Lines • ▼ Show 20 Lines	if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);		V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)		if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);		V2 = getZeroVector(VT, Subtarget, DAG, DL);

return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,		return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
DAG.getTargetConstant(Immediate, DL, MVT::i8));		DAG.getTargetConstant(Immediate, DL, MVT::i8));
}		}

		// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
		// by zeroable elements in the remaining 24 elements. Turn this into two
		// vmovqb instructions shuffled together.
		static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
		SDValue V1, SDValue V2,
		ArrayRef<int> Mask,
		const APInt &Zeroable,
		SelectionDAG &DAG) {
		assert(VT == MVT::v32i8 && "Unexpected type!");

		// The first 8 indices should be every 8th element.
		if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
		return SDValue();

		// Remaining elements need to be zeroable.
		if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
		craig.topperAuthorUnsubmitted Done Reply Inline Actions I went with an approach that relied less on having two magic numbers mentioned. I wrote it in terms of Mask.size() even though we know that's 32, so that only the 8 that was already used above is mentioned again. craig.topper: I went with an approach that relied less on having two magic numbers mentioned. I wrote it in…
		return SDValue();
		RKSimonUnsubmitted Not Done Reply Inline Actions isSequentialOrUndefInRange(Mask, 0, 8, 0, 8) ? RKSimon: isSequentialOrUndefInRange(Mask, 0, 8, 0, 8) ?

		V1 = DAG.getBitcast(MVT::v4i64, V1);
		V2 = DAG.getBitcast(MVT::v4i64, V2);
		RKSimonUnsubmitted Not Done Reply Inline Actions Zeroable.extractBits(16, 8).isAllOnesValue() ? RKSimon: Zeroable.extractBits(16, 8).isAllOnesValue() ?
		craig.topperAuthorUnsubmitted Done Reply Inline Actions That should have been 8-32. I guess I had 24 bits in my head and wrote the wrong end. craig.topper: That should have been 8-32. I guess I had 24 bits in my head and wrote the wrong end.

		V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
		V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);

		// The VTRUNCS will put 0s in the upper 12 bytes. Use them to put zeroes in
		// the upper bits of the result using an unpckldq.
		SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
		{ 0, 1, 2, 3, 16, 17, 18, 19,
		4, 5, 6, 7, 20, 21, 22, 23 });
		// Insert the unpckldq into a zero vector to widen to v32i8.
		return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
		DAG.getConstant(0, DL, MVT::v32i8), Unpack,
		DAG.getIntPtrConstant(0, DL));
		}


/// Handle lowering of 4-lane 64-bit floating point shuffles.		/// Handle lowering of 4-lane 64-bit floating point shuffles.
///		///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2		/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.		/// isn't available.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,		static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,		const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
▲ Show 20 Lines • Show All 584 Lines • ▼ Show 20 Lines	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;		return Result;

// Try to permute the lanes and then use a per-lane permute.		// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(		if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))		DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;		return V;

		// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
		// by zeroable elements in the remaining 24 elements. Turn this into two
		// vmovqb instructions shuffled together.
		if (Subtarget.hasVLX())
		if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
		Mask, Zeroable, DAG))
		return V;

// Otherwise fall back on generic lowering.		// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,		return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);		Subtarget, DAG);
}		}

/// High-level routine to lower various 256-bit x86 vector shuffles.		/// High-level routine to lower various 256-bit x86 vector shuffles.
///		///
/// This routine either breaks down the specific type of a 256-bit x86 vector		/// This routine either breaks down the specific type of a 256-bit x86 vector
▲ Show 20 Lines • Show All 29,642 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/min-legal-vector-width.ll

Show First 20 Lines • Show All 825 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x		%a = load <16 x i32>, <16 x i32>* %x
%b = trunc <16 x i32> %a to <16 x i8>		%b = trunc <16 x i32> %a to <16 x i8>
ret <16 x i8> %b		ret <16 x i8> %b
}		}

define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {		define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: trunc_v8i64_v8i8:		; CHECK-AVX512-LABEL: trunc_v8i64_v8i8:
; CHECK-AVX512: # %bb.0:		; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa (%rdi), %xmm0		; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0
; CHECK-AVX512-NEXT: vmovdqa 16(%rdi), %xmm1		; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %xmm2		; CHECK-AVX512-NEXT: vpmovqb %ymm1, %xmm1
; CHECK-AVX512-NEXT: vmovdqa 48(%rdi), %xmm3		; CHECK-AVX512-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>		; CHECK-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3		; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; CHECK-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; CHECK-AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; CHECK-AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; CHECK-AVX512-NEXT: retq		; CHECK-AVX512-NEXT: retq
;		;
; CHECK-VBMI-LABEL: trunc_v8i64_v8i8:		; CHECK-VBMI-LABEL: trunc_v8i64_v8i8:
; CHECK-VBMI: # %bb.0:		; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1		; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]		; CHECK-VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0		; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0		; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
▲ Show 20 Lines • Show All 253 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

	Show First 20 Lines • Show All 543 Lines • ▼ Show 20 Lines
	; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0			; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]			; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]			; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
	; AVX512F-NEXT: vmovq %xmm0, (%rsi)			; AVX512F-NEXT: vmovq %xmm0, (%rsi)
	; AVX512F-NEXT: retq			; AVX512F-NEXT: retq
	;			;
	; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:			; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
	; AVX512VL: # %bb.0:			; AVX512VL: # %bb.0:
	; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0			; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
	; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1			; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
	; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2			; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1
	; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3			; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
	; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>			; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
	; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
	; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
	; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
	; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
	; AVX512VL-NEXT: vmovq %xmm0, (%rsi)			; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
				; AVX512VL-NEXT: vzeroupper
	; AVX512VL-NEXT: retq			; AVX512VL-NEXT: retq
	;			;
	; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:			; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
	; AVX512BW: # %bb.0:			; AVX512BW: # %bb.0:
	; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0			; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
	; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1			; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
	; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2			; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
	; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3			; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
	; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>			; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
	; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3			; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
	; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2			; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
	; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]			; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
	; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>			; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
	; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1			; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0			; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]			; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]			; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
	; AVX512BW-NEXT: vmovq %xmm0, (%rsi)			; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
	; AVX512BW-NEXT: retq			; AVX512BW-NEXT: retq
	;			;
	; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:			; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
	; AVX512BWVL: # %bb.0:			; AVX512BWVL: # %bb.0:
	; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0			; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
	; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1			; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
	; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2			; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1
	; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3			; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
	; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>			; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
	; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
	; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
	; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
	; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
	; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)			; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
				; AVX512BWVL-NEXT: vzeroupper
	; AVX512BWVL-NEXT: retq			; AVX512BWVL-NEXT: retq
	;			;
	; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:			; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
	; AVX512VBMI: # %bb.0:			; AVX512VBMI: # %bb.0:
	; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0			; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
	; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1			; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
	; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2			; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
	; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3			; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
	▲ Show 20 Lines • Show All 298 Lines • Show Last 20 Lines