diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3488,8 +3488,19 @@ Optional ShuffleKind = isShuffle(VL, Mask); if (ShuffleKind.hasValue()) { - InstructionCost Cost = - TTI->getShuffleCost(ShuffleKind.getValue(), VecTy, Mask); + InstructionCost Cost = 0; + + if (*ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc) + Cost += TTI->getShuffleCost(ShuffleKind.getValue(), VecTy, Mask); + + unsigned NumOfParts = TTI->getNumberOfParts(VecTy); + // Compute the number of elements per vector register for + // VecTy. If that is not possible, because the number of parts for VecTy + // is unknown, use the maximum value for unsigned. + unsigned EltsPerVector = + NumOfParts ? VecTy->getNumElements() / NumOfParts : -1; + unsigned Idx = 0; + for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this @@ -3502,6 +3513,26 @@ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } + if (*ShuffleKind == TargetTransformInfo::SK_PermuteSingleSrc) { + if ((Idx > 0 && (Idx + 1) % EltsPerVector == 0) || + Idx + 1 == VL.size()) { + SmallVector Mask; + unsigned StartIdx = + Idx - std::min(EltsPerVector, VecTy->getNumElements()) + 1; + Optional ShuffleKind = + isShuffle( + {&VL[StartIdx], + std::min(EltsPerVector, unsigned(VL.size() - StartIdx))}, + Mask); + Cost += TTI->getShuffleCost( + ShuffleKind.getValue(), + FixedVectorType::get( + VecTy->getElementType(), + std::min(EltsPerVector, VecTy->getNumElements())), + Mask); + } + } + ++Idx; } return ReuseShuffleCost + Cost; }