This is an archive of the discontinued LLVM Phabricator instance.

Differential D120072

[PowerPC] Add option to disable perfect shuffle
ClosedPublic

Authored by qiucf on Feb 17 2022, 10:01 AM.

Download Raw Diff

Details

Reviewers

jsji
nemanjai
shchenz

Group Reviewers

Restricted Project

Commits

rG43d48ed22029: [PowerPC] Add option to disable perfect shuffle

Summary

Perfect shuffle was introduced into the PowerPC backend years ago, and is only available on big-endian subtargets.

This optimization works well in simple cases, but has a serious negative impact on large programs in which many shuffle instructions share the same mask. D116801 fixes the issue in those programs, but it still causes a performance degradation similar to that of disabling perfect shuffle.

So I propose introducing a temporary hidden backend option to control it until we have implemented a better way to fix the gap in vector shuffle decomposition.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

qiucf created this revision.Feb 17 2022, 10:01 AM

Herald added subscribers: wenlei, steven.zhang, kbarton, hiraditya. · View Herald TranscriptFeb 17 2022, 10:01 AM

qiucf requested review of this revision.Feb 17 2022, 10:01 AM

Herald added a project: Restricted Project. · View Herald TranscriptFeb 17 2022, 10:01 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B150276: Diff 409699.Feb 17 2022, 10:47 AM

qiucf updated this revision to Diff 409858.Feb 17 2022, 11:08 PM

qiucf retitled this revision from [PowerPC] Disable perfect shuffle by default to [PowerPC] Add option to disable perfect shuffle.

qiucf edited the summary of this revision. (Show Details)

Harbormaster completed remote builds in B150379: Diff 409858.Feb 17 2022, 11:59 PM

LGTM. Thanks.

This revision is now accepted and ready to land.Feb 18 2022, 8:30 PM

This revision was landed with ongoing or failed builds.Feb 20 2022, 9:41 AM

Closed by commit rG43d48ed22029: [PowerPC] Add option to disable perfect shuffle (authored by qiucf). · Explain Why

This revision was automatically updated to reflect the committed changes.

qiucf added a commit: rG43d48ed22029: [PowerPC] Add option to disable perfect shuffle.

Revision Contents

Path

Size

llvm/

lib/

Target/

PowerPC/

PPCISelLowering.cpp

102 lines

Diff 410161

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",		static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);		cl::desc("use absolute jump tables on ppc"), cl::Hidden);

static cl::opt<bool> EnableQuadwordAtomics(		static cl::opt<bool> EnableQuadwordAtomics(
"ppc-quadword-atomics",		"ppc-quadword-atomics",
cl::desc("enable quadword lock-free atomic operations"), cl::init(false),		cl::desc("enable quadword lock-free atomic operations"), cl::init(false),
cl::Hidden);		cl::Hidden);

		static cl::opt<bool>
		DisablePerfectShuffle("ppc-disable-perfect-shuffle",
		cl::desc("disable vector permute decomposition"),
		cl::init(false), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");		STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");		STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");		STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");		STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");

static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);		static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);

static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);		static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
▲ Show 20 Lines • Show All 9,929 Lines • ▼ Show 20 Lines	if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||		PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))		PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
return Op;		return Op;

// Check to see if this is a shuffle of 4-byte values. If so, we can use our		// Check to see if this is a shuffle of 4-byte values. If so, we can use our
// perfect shuffle table to emit an optimal matching sequence.		// perfect shuffle table to emit an optimal matching sequence.
ArrayRef<int> PermMask = SVOp->getMask();		ArrayRef<int> PermMask = SVOp->getMask();

		if (!DisablePerfectShuffle && !isLittleEndian) {
unsigned PFIndexes[4];		unsigned PFIndexes[4];
bool isFourElementShuffle = true;		bool isFourElementShuffle = true;
for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number		for (unsigned i = 0; i != 4 && isFourElementShuffle;
		++i) { // Element number
unsigned EltNo = 8; // Start out undef.		unsigned EltNo = 8; // Start out undef.
for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.		for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
if (PermMask[i*4+j] < 0)		if (PermMask[i * 4 + j] < 0)
continue; // Undef, ignore it.		continue; // Undef, ignore it.

unsigned ByteSource = PermMask[i*4+j];		unsigned ByteSource = PermMask[i * 4 + j];
if ((ByteSource & 3) != j) {		if ((ByteSource & 3) != j) {
isFourElementShuffle = false;		isFourElementShuffle = false;
break;		break;
}		}

if (EltNo == 8) {		if (EltNo == 8) {
EltNo = ByteSource/4;		EltNo = ByteSource / 4;
} else if (EltNo != ByteSource/4) {		} else if (EltNo != ByteSource / 4) {
isFourElementShuffle = false;		isFourElementShuffle = false;
break;		break;
}		}
}		}
PFIndexes[i] = EltNo;		PFIndexes[i] = EltNo;
}		}

// If this shuffle can be expressed as a shuffle of 4-byte elements, use the		// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
// perfect shuffle vector to determine if it is cost effective to do this as		// perfect shuffle vector to determine if it is cost effective to do this as
// discrete instructions, or whether we should use a vperm.		// discrete instructions, or whether we should use a vperm.
// For now, we skip this for little endian until such time as we have a		// For now, we skip this for little endian until such time as we have a
// little-endian perfect shuffle table.		// little-endian perfect shuffle table.
if (isFourElementShuffle && !isLittleEndian) {		if (isFourElementShuffle) {
// Compute the index in the perfect shuffle table.		// Compute the index in the perfect shuffle table.
unsigned PFTableIndex =		unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];		PFIndexes[2] * 9 + PFIndexes[3];

unsigned PFEntry = PerfectShuffleTable[PFTableIndex];		unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);		unsigned Cost = (PFEntry >> 30);

// Determining when to avoid vperm is tricky. Many things affect the cost		// Determining when to avoid vperm is tricky. Many things affect the cost
// of vperm, particularly how many times the perm mask needs to be computed.		// of vperm, particularly how many times the perm mask needs to be
// For example, if the perm mask can be hoisted out of a loop or is already		// computed. For example, if the perm mask can be hoisted out of a loop or
// used (perhaps because there are multiple permutes with the same shuffle		// is already used (perhaps because there are multiple permutes with the
// mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of		// same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the
// the loop requires an extra register.		// permute mask out of the loop requires an extra register.
//		//
// As a compromise, we only emit discrete instructions if the shuffle can be		// As a compromise, we only emit discrete instructions if the shuffle can
// generated in 3 or fewer operations. When we have loop information		// be generated in 3 or fewer operations. When we have loop information
// available, if this block is within a loop, we should avoid using vperm		// available, if this block is within a loop, we should avoid using vperm
// for 3-operation perms and use a constant pool load instead.		// for 3-operation perms and use a constant pool load instead.
if (Cost < 3)		if (Cost < 3)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);		return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}		}
		}

// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant		// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
// vector that will get spilled to the constant pool.		// vector that will get spilled to the constant pool.
if (V2.isUndef()) V2 = V1;		if (V2.isUndef()) V2 = V1;

// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except		// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.		// that it is in input element units, not in bytes. Convert now.

▲ Show 20 Lines • Show All 7,969 Lines • Show Last 20 Lines