diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -126,6 +126,11 @@ cl::desc("enable quadword lock-free atomic operations"), cl::init(false), cl::Hidden); +static cl::opt + DisablePerfectShuffle("ppc-disable-perfect-shuffle", + cl::desc("disable vector permute decomposition"), + cl::init(false), cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); @@ -10071,56 +10076,59 @@ // perfect shuffle table to emit an optimal matching sequence. ArrayRef PermMask = SVOp->getMask(); - unsigned PFIndexes[4]; - bool isFourElementShuffle = true; - for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number - unsigned EltNo = 8; // Start out undef. - for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. - if (PermMask[i*4+j] < 0) - continue; // Undef, ignore it. - - unsigned ByteSource = PermMask[i*4+j]; - if ((ByteSource & 3) != j) { - isFourElementShuffle = false; - break; - } + if (!DisablePerfectShuffle && !isLittleEndian) { + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; + ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i * 4 + j] < 0) + continue; // Undef, ignore it. + + unsigned ByteSource = PermMask[i * 4 + j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } - if (EltNo == 8) { - EltNo = ByteSource/4; - } else if (EltNo != ByteSource/4) { - isFourElementShuffle = false; - break; + if (EltNo == 8) { + EltNo = ByteSource / 4; + } else if (EltNo != ByteSource / 4) { + isFourElementShuffle = false; + break; + } } + PFIndexes[i] = EltNo; + } + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be + // computed. For example, if the perm mask can be hoisted out of a loop or + // is already used (perhaps because there are multiple permutes with the + // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the + // permute mask out of the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can + // be generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. + if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - PFIndexes[i] = EltNo; - } - - // If this shuffle can be expressed as a shuffle of 4-byte elements, use the - // perfect shuffle vector to determine if it is cost effective to do this as - // discrete instructions, or whether we should use a vperm. - // For now, we skip this for little endian until such time as we have a - // little-endian perfect shuffle table. - if (isFourElementShuffle && !isLittleEndian) { - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = - PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - // Determining when to avoid vperm is tricky. Many things affect the cost - // of vperm, particularly how many times the perm mask needs to be computed. - // For example, if the perm mask can be hoisted out of a loop or is already - // used (perhaps because there are multiple permutes with the same shuffle - // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of - // the loop requires an extra register. - // - // As a compromise, we only emit discrete instructions if the shuffle can be - // generated in 3 or fewer operations. When we have loop information - // available, if this block is within a loop, we should avoid using vperm - // for 3-operation perms and use a constant pool load instead. - if (Cost < 3) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant