This is an archive of the discontinued LLVM Phabricator instance.

[x86] Teach the target-specific combining how to aggressively fold half-shuffles, even looking through intervening instructions in a chain.
ClosedPublic

Authored by chandlerc on Jun 25 2014, 6:51 AM.

Download Raw Diff

Details

Reviewers

grosbach
filcab

Commits

rG0d6d1f2b1769: [x86] Teach the target-specific combining how to aggressively fold half…
rL211890: [x86] Teach the target-specific combining how to aggressively fold

Summary

This doesn't happen to show up with any test cases I've found for the current
shuffle lowering, but previous attempts would benefit from this and it seems
generally useful. I've tested it directly using intrinsics, which also shows
that it will work with hand-vectorized code.

Note that even though pshufd isn't directly used in these tests, it gets
exercised because we combine some of the half shuffles into a pshufd
first, and then merge them.

Diff Detail

Repository: rL LLVM

Event Timeline

chandlerc updated this revision to Diff 10834.Jun 25 2014, 6:51 AM

chandlerc retitled this revision from to [x86] Teach the target-specific combining how to aggressively fold half-shuffles, even looking through intervening instructions in a chain..

chandlerc updated this object.

chandlerc edited the test plan for this revision. (Show Details)

chandlerc added reviewers: grosbach, filcab.

chandlerc added a subscriber: Unknown Object (MLST).

Nitty comments inline. Code looks great.

lib/Target/X86/X86ISelLowering.cpp
19002 ↗	(On Diff #10834)	grammar. "if the halves" maybe what you meant?
19006 ↗	(On Diff #10834)	"of it's input" sounds odd, but I'm not sure it's wrong. Can you rephrase?
19032 ↗	(On Diff #10834)	s/nodes/node's/
test/CodeGen/X86/vector-shuffle-combining.ll
11 ↗	(On Diff #10834)	I prefer not to look for these sorts of comments in test lines if at all possible and to just run llc with -asm-verbose=false. That enables explicit CHECK-NEXT for instructions right after the function label to make sure there's not extraneous stuff showing up, which looks like what you're trying to do here.

This revision is now accepted and ready to land.Jun 25 2014, 10:08 AM

LGTM

Generally, will apply fixes before submitting. One note though.

test/CodeGen/X86/vector-shuffle-combining.ll
11 ↗	(On Diff #10834)	I understand your hesitance, but with shuffles, I think we have to use comment-based checking. It is essential to write the shuffle instruction check lines using the comments that decode the operands rather than having opaque "$-27" and company in check lines.

Applying with suggested fixes generally. Thanks!

lib/Target/X86/X86ISelLowering.cpp
19002 ↗	(On Diff #10834)	No, it's the rest of the sentence that had the wrong grammar. =]
19006 ↗	(On Diff #10834)	Old comment, the one above I think is closer despite the grammar fail. Nuked this one.

Closed by commit rL211890 (authored by @chandlerc).

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

90 lines

test/

CodeGen/

X86/

vector-shuffle-combining.ll

49 lines

Diff 10924

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,055 Lines • ▼ Show 20 Lines	case X86ISD::PSHUFHW:
for (int &M : Mask)		for (int &M : Mask)
M -= 4;		M -= 4;
return Mask;		return Mask;
default:		default:
llvm_unreachable("No valid shuffle instruction found!");		llvm_unreachable("No valid shuffle instruction found!");
}		}
}		}

		/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
		/// pshufhw.
		///
		/// We walk up the chain, skipping shuffles of the other half and looking
		/// through shuffles which switch halves trying to find a shuffle of the same
		/// pair of dwords.
		///
		/// \param N    The PSHUFLW or PSHUFHW node that ends the chain (asserted).
		/// \param Mask The 4-element shuffle mask of \p N. It is only written to when
		///             the combine succeeds; on failure it is left untouched so the
		///             caller can keep using it for other combines.
		/// \param DAG  Used to build the merged shuffle node and to replace \p N.
		/// \param DCI  Used to replace the matched shuffle with the merged one.
		/// \returns true if \p N was folded into an earlier shuffle in the chain,
		///          false if nothing was combined.
		static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
		                                        SelectionDAG &DAG,
		                                        TargetLowering::DAGCombinerInfo &DCI) {
		  assert(
		      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
		      "Called with something other than an x86 128-bit half shuffle!");
		  SDLoc DL(N);
		  unsigned CombineOpcode = N.getOpcode();

		  // Work on a private copy of the mask. Walking through a pshufd rewrites the
		  // mask, and we may still bail out afterwards; the caller must not observe a
		  // half-translated mask in that case.
		  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());

		  // Walk up a single-use chain looking for a combinable shuffle.
		  SDValue V = N.getOperand(0);
		  for (; V.hasOneUse(); V = V.getOperand(0)) {
		    switch (V.getOpcode()) {
		    default:
		      return false; // Nothing combined!

		    case ISD::BITCAST:
		      // Skip bitcasts as we always know the type for the target specific
		      // instructions.
		      continue;

		    case X86ISD::PSHUFLW:
		    case X86ISD::PSHUFHW:
		      if (V.getOpcode() == CombineOpcode)
		        break;

		      // Other-half shuffles are no-ops.
		      continue;

		    case X86ISD::PSHUFD: {
		      // We can only handle pshufd if the half we are combining either stays in
		      // its half, or switches to the other half. Bail if one of these isn't
		      // true.
		      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
		      int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
		      if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
		            (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
		        return false;

		      // Map the mask through the pshufd and keep walking up the chain.
		      for (int &M : NewMask)
		        M = 2 * (VMask[DOffset + M / 2] % 2) + M % 2;

		      // Switch halves if the pshufd does. Both dwords of our half were just
		      // checked to come from the same source half, so probing the first one
		      // is sufficient.
		      CombineOpcode =
		          VMask[DOffset + 0] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
		      continue;
		    }
		    }
		    // Break out of the loop if we break out of the switch.
		    break;
		  }

		  if (!V.hasOneUse())
		    // We fell out of the loop without finding a viable combining instruction.
		    return false;

		  // Record the old value to use in RAUW-ing.
		  SDValue Old = V;

		  // Merge this node's mask and our incoming mask (adjusted to account for all
		  // the pshufd instructions encountered).
		  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
		  for (int &M : NewMask)
		    M = VMask[M];
		  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
		                  getV4X86ShuffleImm8ForMask(NewMask, DAG));

		  // The combine is going ahead: publish the merged mask to the caller.
		  for (unsigned i = 0, e = NewMask.size(); i != e; ++i)
		    Mask[i] = NewMask[i];

		  // Replace N with its operand as we're going to combine that shuffle away.
		  DAG.ReplaceAllUsesWith(N, N.getOperand(0));

		  // Replace the combinable shuffle with the combined one, updating all users
		  // so that we re-evaluate the chain here.
		  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
		  return true;
		}

/// \brief Try to combine x86 target specific shuffles.		/// \brief Try to combine x86 target specific shuffles.
static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,		static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {		const X86Subtarget *Subtarget) {
SDLoc DL(N);		SDLoc DL(N);
MVT VT = N.getSimpleValueType();		MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;		SmallVector<int, 4> Mask;

switch (N.getOpcode()) {		switch (N.getOpcode()) {
case X86ISD::PSHUFD:		case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:		case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:		case X86ISD::PSHUFHW:
Mask = getPSHUFShuffleMask(N);		Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);		assert(Mask.size() == 4);
break;		break;
default:		default:
return SDValue();		return SDValue();
}		}

		// Nuke no-op shuffles that show up after combining.
		if (isNoopShuffleMask(Mask))
		return DCI.CombineTo(N.getNode(), N.getOperand(0), /AddTo/ true);

		// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);		SDValue V = N.getOperand(0);
switch (N.getOpcode()) {		switch (N.getOpcode()) {
default:		default:
break;		break;
case X86ISD::PSHUFLW:		case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:		case X86ISD::PSHUFHW:
assert(VT == MVT::v8i16);		assert(VT == MVT::v8i16);

		if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
		return SDValue(); // We combined away this shuffle, so we're done.

// See if this reduces to a PSHUFD which is no more expensive and can		// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations.		// combine with more operations.
if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&		if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
areAdjacentMasksSequential(Mask)) {		areAdjacentMasksSequential(Mask)) {
int DMask[] = {-1, -1, -1, -1};		int DMask[] = {-1, -1, -1, -1};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;		int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + Mask[0] / 2;		DMask[DOffset + 0] = DOffset + Mask[0] / 2;
DMask[DOffset + 1] = DOffset + Mask[2] / 2;		DMask[DOffset + 1] = DOffset + Mask[2] / 2;
▲ Show 20 Lines • Show All 4,093 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll

				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2

				target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-unknown"

				declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
				declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)

				; Two back-to-back pshuflw with immediate 27 (0b00011011, i.e. word order
				; [3,2,1,0]). Reversing the low half twice is the identity, so the checks
				; expect no shuffle instruction at all before the return.
				define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
				; CHECK-SSE2-LABEL: @combine_pshuflw1
				; CHECK-SSE2: # BB#0:
				; CHECK-SSE2-NEXT: retq
				%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
				%c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
				ret <8 x i16> %c
				}

				; Two low-half reversals (immediate 27, order [3,2,1,0]) separated by a
				; pshufhw with immediate -28 (0xE4, order [0,1,2,3], i.e. a no-op). The
				; checks expect the whole chain to fold away, leaving only the return.
				define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
				; CHECK-SSE2-LABEL: @combine_pshuflw2
				; CHECK-SSE2: # BB#0:
				; CHECK-SSE2-NEXT: retq
				%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
				%c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
				%d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
				ret <8 x i16> %d
				}

				; Two low-half reversals (immediate 27) with a real high-half reversal
				; (pshufhw, immediate 27) between them. The low-half shuffles cancel across
				; the intervening pshufhw, so the checks expect exactly one pshufhw that
				; reverses words 4-7.
				define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
				; CHECK-SSE2-LABEL: @combine_pshuflw3
				; CHECK-SSE2: # BB#0:
				; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
				; CHECK-SSE2-NEXT: retq
				%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
				%c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
				%d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
				ret <8 x i16> %d
				}

				; Mirror image of combine_pshuflw3: two high-half reversals (pshufhw,
				; immediate 27) with a low-half reversal (pshuflw, immediate 27) between
				; them. The high-half shuffles cancel, so the checks expect exactly one
				; pshuflw that reverses words 0-3.
				define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
				; CHECK-SSE2-LABEL: @combine_pshufhw1
				; CHECK-SSE2: # BB#0:
				; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
				; CHECK-SSE2-NEXT: retq
				%b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
				%c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
				%d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
				ret <8 x i16> %d
				}