Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2613,6 +2613,72 @@ ArrayRef Mask, int Index, VectorType *SubTp, ArrayRef Args) { + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + // If we have a Mask, and the LT is being legalized somehow, split the Mask + // into smaller vectors and sum the cost of each shuffle. + if (!Mask.empty() && isa(Tp) && + Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && + Tp->getElementCount() != LT.second.getVectorElementCount() && !Index && + !SubTp) { + unsigned TpNumElts = cast(Tp)->getNumElements(); + assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); + unsigned LTNumElts = LT.second.getVectorNumElements(); + unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; + VectorType *NTp = + VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); + InstructionCost Cost; + for (unsigned N = 0; N < NumVecs; N++) { + SmallVector NMask; + // Split the existing mask into chunks of size LTNumElts. Track the source + // sub-vectors to ensure the result has at most 2 inputs. + unsigned Source1, Source2; + unsigned NumSources = 0; + for (unsigned E = 0; E < LTNumElts; E++) { + int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] + : UndefMaskElem; + if (MaskElt < 0) { + NMask.push_back(UndefMaskElem); + continue; + } + + // Calculate which source from the input this comes from and whether it + // is new to us. + unsigned Source = MaskElt / LTNumElts; + if (NumSources == 0) { + Source1 = Source; + NumSources = 1; + } else if (NumSources == 1 && Source != Source1) { + Source2 = Source; + NumSources = 2; + } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { + NumSources++; + } + + // Add to the new mask. For the NumSources>2 case these are not correct, + // but are only used for the modular lane number. + if (Source == Source1) + NMask.push_back(MaskElt % LTNumElts); + else if (Source == Source2) + NMask.push_back(MaskElt % LTNumElts + LTNumElts); + else + NMask.push_back(MaskElt % LTNumElts); + } + // If the sub-mask has at most 2 input sub-vectors then re-cost it using + // getShuffleCost. If not then cost it using the worst case. + if (NumSources <= 2) + Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + NTp, NMask, 0, nullptr, Args); + else if (any_of(enumerate(NMask), [&](const auto &ME) { + return ME.value() % LTNumElts == ME.index(); + })) + Cost += LTNumElts - 1; + else + Cost += LTNumElts; + } + return Cost; + } + // If we have 4 elements for the shuffle and a Mask, get the cost straight // from the perfect shuffle tables. if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && @@ -2621,7 +2687,6 @@ return getPerfectShuffleCost(Mask); Kind = improveShuffleKindFromMask(Kind, Mask); - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_Reverse) { Index: llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -101,19 +101,19 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> @@ -166,18 +166,18 @@ ; CHECK-LABEL: 'multipart' ; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16b = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v64d = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64d = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64ab = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>