diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10040,56 +10040,87 @@ // perfect shuffle table to emit an optimal matching sequence. ArrayRef PermMask = SVOp->getMask(); - unsigned PFIndexes[4]; - bool isFourElementShuffle = true; - for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number - unsigned EltNo = 8; // Start out undef. - for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. - if (PermMask[i*4+j] < 0) - continue; // Undef, ignore it. - - unsigned ByteSource = PermMask[i*4+j]; - if ((ByteSource & 3) != j) { - isFourElementShuffle = false; - break; + // Record all shuffle masks in the DAG. The first integer in value part is + // occurance of the mask, and the second means whether it occured multiple + // times, because the integer will decrement and all entries should be removed + // after visiting current DAG. + static DenseMap, int> MaskMap; + static const SDNode *CurDAGRoot = DAG.getRoot().getNode(); + + if (CurDAGRoot != DAG.getRoot().getNode()) { + CurDAGRoot = DAG.getRoot().getNode(); + MaskMap.clear(); + } + + if (MaskMap.find(PermMask) == MaskMap.end()) { + for (const SDNode &Node : DAG.allnodes()) { + if (const auto *Shuffle = dyn_cast(&Node)) { + ArrayRef TheMask = Shuffle->getMask(); + if (MaskMap.find(TheMask) == MaskMap.end()) + MaskMap.insert(std::make_pair(TheMask, 0)); + MaskMap[TheMask] += 1; } + } + } - if (EltNo == 8) { - EltNo = ByteSource/4; - } else if (EltNo != ByteSource/4) { - isFourElementShuffle = false; - break; + // When there're multiple shuffles sharing the same mask, we think it's + // cheaper to use load+vperm. + if (MaskMap[PermMask] == 1) { + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; + ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i * 4 + j] < 0) + continue; // Undef, ignore it. + + unsigned ByteSource = PermMask[i * 4 + j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } + + if (EltNo == 8) { + EltNo = ByteSource / 4; + } else if (EltNo != ByteSource / 4) { + isFourElementShuffle = false; + break; + } } + PFIndexes[i] = EltNo; + } + + // Remove entry of this mask from map since it's unique in current DAG. + MaskMap.erase(PermMask); + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle && !isLittleEndian) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be + // computed. For example, if the perm mask can be hoisted out of a loop or + // is already used (perhaps because there are multiple permutes with the + // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the + // permute mask out of the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can + // be generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. + if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - PFIndexes[i] = EltNo; - } - - // If this shuffle can be expressed as a shuffle of 4-byte elements, use the - // perfect shuffle vector to determine if it is cost effective to do this as - // discrete instructions, or whether we should use a vperm. - // For now, we skip this for little endian until such time as we have a - // little-endian perfect shuffle table. - if (isFourElementShuffle && !isLittleEndian) { - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = - PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - // Determining when to avoid vperm is tricky. Many things affect the cost - // of vperm, particularly how many times the perm mask needs to be computed. - // For example, if the perm mask can be hoisted out of a loop or is already - // used (perhaps because there are multiple permutes with the same shuffle - // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of - // the loop requires an extra register. - // - // As a compromise, we only emit discrete instructions if the shuffle can be - // generated in 3 or fewer operations. When we have loop information - // available, if this block is within a loop, we should avoid using vperm - // for 3-operation perms and use a constant pool load instead. - if (Cost < 3) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant diff --git a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll --- a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll @@ -38,12 +38,11 @@ define <4 x float> @shuffle3(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x i8> %v4) { ; BE-LABEL: shuffle3: ; BE: # %bb.0: -; BE-NEXT: vmrglw 0, 2, 3 -; BE-NEXT: vmrghw 2, 2, 3 -; BE-NEXT: vmrglw 3, 4, 5 -; BE-NEXT: vmrghw 4, 4, 5 -; BE-NEXT: vmrghw 2, 2, 0 -; BE-NEXT: vmrghw 3, 4, 3 +; BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha +; BE-NEXT: addi 3, 3, .LCPI2_0@toc@l +; BE-NEXT: lxv 32, 0(3) +; BE-NEXT: vperm 2, 2, 3, 0 +; BE-NEXT: vperm 3, 4, 5, 0 ; BE-NEXT: xvaddsp 34, 34, 35 ; BE-NEXT: blr ;