This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles (PR39161)
ClosedPublic

Authored by RKSimon on Oct 11 2018, 10:24 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel
lebedev.ri
andreadb

Commits

rGf3952413f7fb: [X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles…
rL344446: [X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles…

Summary

Add shuffle lowering for the case where we can shuffle the lanes into place followed by an in-lane permute.

This is mainly for cases where we can have non-repeating permutes in each lane, but for now I've just enabled it for v4f64 unary shuffles to fix PR39161. There is not much test coverage for other shuffles that might benefit yet.

We now have several cross-lane shuffle lowering methods that all do something similar - I've looked at merging some of these (notably by making the repeated mask mechanism in lowerVectorShuffleByMerging128BitLanes optional), but there are a lot of assertions/assumptions in the way that make this tricky - I ended up going for adding yet another relatively simple method instead.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Oct 11 2018, 10:24 AM

RKSimon mentioned this in rL344332: [X86][AVX] Add examples of shuffles that can be reduced to a cross-lane shuffle…. Oct 12 2018, 3:29 AM

craig.topper added inline comments. Oct 12 2018, 12:53 PM

lib/Target/X86/X86ISelLowering.cpp
13445 ↗	(On Diff #169242)	Why do LaneMask and PermMask have the same actual size, but different "small" size?

RKSimon added inline comments. Oct 12 2018, 1:26 PM

lib/Target/X86/X86ISelLowering.cpp
13445 ↗	(On Diff #169242)	copy+paste has failed me again....

Fixed SmallVector<> LaneMask

LGTM

This revision is now accepted and ready to land. Oct 12 2018, 1:35 PM

Closed by commit rL344446: [X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles… (authored by RKSimon). · Explain Why · Oct 13 2018, 4:41 AM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in rL344481: [X86][AVX] Enable lowerVectorShuffleAsLanePermuteAndPermute v16i16/v32i8…. Oct 14 2018, 10:36 AM

RKSimon mentioned this in D58237: [X86][AVX] lowerShuffleAsLanePermuteAndPermute - fully populate the lane shuffle mask (PR40730). Feb 14 2019, 7:49 AM

RKSimon mentioned this in rL354117: [X86][AVX] lowerShuffleAsLanePermuteAndPermute - fully populate the lane…. Feb 15 2019, 3:39 AM

RKSimon mentioned this in rG6ce08672fb4d: [X86][AVX] lowerShuffleAsLanePermuteAndPermute - fully populate the lane….

hans mentioned this in rL354260: Merging r354034 and r354117:. Feb 18 2019, 3:23 AM

hansw mentioned this in rGcc3d3f1f0762: Merging r354034 and r354117:. Feb 18 2019, 3:23 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

60 lines

test/

CodeGen/

X86/

vector-shuffle-256-v4.ll

16 lines

Diff 169558

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 13,425 Lines • ▼ Show 20 Lines	if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);		return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

// Otherwise, just fall back to decomposed shuffles and a blend. This requires		// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.		// that the decomposed single-input shuffles don't end up here.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);		return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}		}

/// Lower a vector shuffle crossing multiple 128-bit lanes as		/// Lower a vector shuffle crossing multiple 128-bit lanes as
		/// a lane permutation followed by a per-lane permutation.
		///
		/// This is mainly for cases where we can have non-repeating permutes
		/// in each lane.
		///
		/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
		/// we should investigate merging them.
		static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
		const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
		SelectionDAG &DAG, const X86Subtarget &Subtarget) {
		int NumElts = VT.getVectorNumElements();
		int NumLanes = VT.getSizeInBits() / 128;
		int NumEltsPerLane = NumElts / NumLanes;

		SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
		SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
		SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);

		for (int i = 0; i != NumElts; ++i) {
		int M = Mask[i];
		if (M < 0)
		continue;

		// Ensure that each lane comes from a single source lane.
		int SrcLane = M / NumEltsPerLane;
		int DstLane = i / NumEltsPerLane;
		if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
		return SDValue();
		SrcLaneMask[DstLane] = SrcLane;

		LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
		PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
		}

		// If we're only shuffling a single lowest lane and the rest are identity
		// then don't bother.
		// TODO - isShuffleMaskInputInPlace could be extended to something like this.
		int NumIdentityLanes = 0;
		bool OnlyShuffleLowestLane = true;
		for (int i = 0; i != NumLanes; ++i) {
		if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
		i * NumEltsPerLane))
		NumIdentityLanes++;
		else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
		OnlyShuffleLowestLane = false;
		}
		if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
		return SDValue();

		SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
		return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
		}

		/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.		/// a permutation and blend of those lanes.
///		///
/// This essentially blends the out-of-lane inputs to each lane into the lane		/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four		/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which		/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware		/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled		/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.		/// prior to trying this lowering.
▲ Show 20 Lines • Show All 719 Lines • ▼ Show 20 Lines	if (Subtarget.hasAVX2())
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));		getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

// Try to create an in-lane repeating shuffle mask and then shuffle the		// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.		// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(		if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;		return V;

		// Try to permute the lanes and then use a per-lane permute.
		if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
		DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
		return V;

// Otherwise, fall back.		// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,		return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG, Subtarget);		DAG, Subtarget);
}		}

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =		if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))		lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
Show All 18 Lines	static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// shuffle. However, if we have AVX2 and either inputs are already in place,		// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single		// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.		// instruction so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||		if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))		isShuffleMaskInputInPlace(1, Mask))))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(		if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;		return Result;

// If we have VLX support, we can use VEXPAND.		// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())		if (Subtarget.hasVLX())
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,		if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
V1, V2, DAG, Subtarget))		V1, V2, DAG, Subtarget))
return V;		return V;

// If we have AVX2 then we always want to lower with a blend because an v4 we		// If we have AVX2 then we always want to lower with a blend because an v4 we
// can fully permute the elements.		// can fully permute the elements.
▲ Show 20 Lines • Show All 27,375 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll

Show First 20 Lines • Show All 85 Lines • ▼ Show 20 Lines
; AVX512VL-NEXT: retq		; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>		%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x double> %shuffle		ret <4 x double> %shuffle
}		}

define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {		define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_1000:		; AVX1-LABEL: shuffle_v4f64_1000:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]		; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]		; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: shuffle_v4f64_1000:		; AVX2-LABEL: shuffle_v4f64_1000:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]		; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512VL-LABEL: shuffle_v4f64_1000:		; AVX512VL-LABEL: shuffle_v4f64_1000:
▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	; AVX512VL-NEXT: retq
%tmp1 = bitcast <4 x i64> %b to <4 x double>		%tmp1 = bitcast <4 x i64> %b to <4 x double>
%shuffle = shufflevector <4 x double> %tmp0, <4 x double> %tmp1, <4 x i32> <i32 2, i32 2, i32 2, i32 2>		%shuffle = shufflevector <4 x double> %tmp0, <4 x double> %tmp1, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x double> %shuffle		ret <4 x double> %shuffle
}		}

define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {		define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2233:		; AVX1-LABEL: shuffle_v4f64_2233:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0		; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]		; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: shuffle_v4f64_2233:		; AVX2-LABEL: shuffle_v4f64_2233:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]		; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512VL-LABEL: shuffle_v4f64_2233:		; AVX512VL-LABEL: shuffle_v4f64_2233:
▲ Show 20 Lines • Show All 572 Lines • ▼ Show 20 Lines
; AVX512VL-NEXT: retq		; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>		%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x i64> %shuffle		ret <4 x i64> %shuffle
}		}

define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {		define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1000:		; AVX1-LABEL: shuffle_v4i64_1000:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]		; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]		; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: shuffle_v4i64_1000:		; AVX2-LABEL: shuffle_v4i64_1000:
; AVX2: # %bb.0:		; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]		; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512VL-LABEL: shuffle_v4i64_1000:		; AVX512VL-LABEL: shuffle_v4i64_1000:
▲ Show 20 Lines • Show All 1,130 Lines • Show Last 20 Lines