diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -932,14 +932,15 @@
   }
 
   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
-                                              ArrayRef<int> Mask) const {
+                                              ArrayRef<int> Mask, Type *Ty,
+                                              int &Index,
+                                              VectorType *&SubTp) const {
     int Limit = Mask.size() * 2;
     if (Mask.empty() ||
         // Extra check required by isSingleSourceMaskImpl function (called by
         // ShuffleVectorInst::isSingleSourceMask).
         any_of(Mask, [Limit](int I) { return I >= Limit; }))
       return Kind;
-    int Index;
     switch (Kind) {
     case TTI::SK_PermuteSingleSrc:
       if (ShuffleVectorInst::isReverseMask(Mask))
@@ -947,7 +948,13 @@
       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
         return TTI::SK_Broadcast;
       break;
-    case TTI::SK_PermuteTwoSrc:
+    case TTI::SK_PermuteTwoSrc: {
+      int NumSubElts;
+      if (ShuffleVectorInst::isInsertSubvectorMask(Mask, Mask.size(),
+                                                   NumSubElts, Index)) {
+        SubTp = FixedVectorType::get(Ty, NumSubElts);
+        return TTI::SK_InsertSubvector;
+      }
       if (ShuffleVectorInst::isSelectMask(Mask))
         return TTI::SK_Select;
       if (ShuffleVectorInst::isTransposeMask(Mask))
@@ -955,6 +962,7 @@
       if (ShuffleVectorInst::isSpliceMask(Mask, Index))
         return TTI::SK_Splice;
       break;
+    }
     case TTI::SK_Select:
     case TTI::SK_Reverse:
     case TTI::SK_Broadcast:
@@ -972,8 +980,8 @@
                                  TTI::TargetCostKind CostKind, int Index,
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = std::nullopt) {
-
-    switch (improveShuffleKindFromMask(Kind, Mask)) {
+    switch (improveShuffleKindFromMask(Kind, Mask, Tp->getElementType(), Index,
+                                       SubTp)) {
     case TTI::SK_Broadcast:
       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
         return getBroadcastShuffleOverhead(FVT, CostKind);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3568,7 +3568,8 @@
     return Cost;
   }
 
-  Kind = improveShuffleKindFromMask(Kind, Mask);
+  Kind = improveShuffleKindFromMask(Kind, Mask, Tp->getElementType(), Index,
+                                    SubTp);
 
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1114,7 +1114,9 @@
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args) {
-  Kind = improveShuffleKindFromMask(Kind, Mask);
+  Kind = improveShuffleKindFromMask(Kind, Mask, VT->getElementType(), Index,
+                                    SubTp);
+
   if (ST->hasVOP3PInsts()) {
     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1211,7 +1211,8 @@
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args) {
-  Kind = improveShuffleKindFromMask(Kind, Mask);
+  Kind = improveShuffleKindFromMask(Kind, Mask, Tp->getElementType(), Index,
+                                    SubTp);
   if (ST->hasNEON()) {
     if (Kind == TTI::SK_Broadcast) {
       static const CostTblEntry NEONDupTbl[] = {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -275,7 +275,8 @@
                                              TTI::TargetCostKind CostKind,
                                              int Index, VectorType *SubTp,
                                              ArrayRef<const Value *> Args) {
-  Kind = improveShuffleKindFromMask(Kind, Mask);
+  Kind = improveShuffleKindFromMask(Kind, Mask, Tp->getElementType(), Index,
+                                    SubTp);
 
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
 
@@ -342,6 +343,35 @@
       }
       break;
     }
+    case TTI::SK_Select: {
+      // We are going to permute multiple sources and the result will be in
+      // multiple destinations. Providing an accurate cost only for splits where
+      // the element type remains the same.
+      if (LT.first.isValid() && LT.first != 1 &&
+          LT.second.isFixedLengthVector() &&
+          LT.second.getVectorElementType().getSizeInBits() ==
+              Tp->getElementType()->getPrimitiveSizeInBits() &&
+          LT.second.getVectorNumElements() <
+              cast<FixedVectorType>(Tp)->getNumElements()) {
+        unsigned NumRegs = *LT.first.getValue();
+        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
+        unsigned SubVF = (VF + NumRegs - 1) / NumRegs;
+        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
+
+        InstructionCost Cost = 0;
+        for (int I = 0, Sz = (VF + NumRegs - 1) / NumRegs; I < Sz; ++I) {
+          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
+          transform(
+              Mask.slice(I * SubVF, I == Sz - 1 ? Mask.size() % SubVF : SubVF),
+              SubMask.begin(),
+              [&](int I) { return ((I / VF == 0) ? 0 : 1) * SubVF + I % VF; });
+          Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SubVecTy, SubMask,
+                                 CostKind, 0, nullptr);
+          return Cost;
+        }
+      }
+      break;
+    }
     }
   };
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -598,7 +598,8 @@
                                                TTI::TargetCostKind CostKind,
                                                int Index, VectorType *SubTp,
                                                ArrayRef<const Value *> Args) {
-  Kind = improveShuffleKindFromMask(Kind, Mask);
+  Kind = improveShuffleKindFromMask(Kind, Mask, Tp->getElementType(), Index,
+                                    SubTp);
   if (ST->hasVector()) {
     unsigned NumVectors = getNumVectorRegs(Tp);
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1469,7 +1469,8 @@
   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
 
-  Kind = improveShuffleKindFromMask(Kind, Mask);
+  Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp->getElementType(), Index,
+                                    SubTp);
 
   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
   if (Kind == TTI::SK_Transpose)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -106,7 +106,7 @@
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -16,8 +16,8 @@
 ; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32>
-; CHECK-NEXT: [[O31:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32>
-; CHECK-NEXT: ret <4 x half> [[O31]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> [[TMP5]], <4 x i32>
+; CHECK-NEXT: ret <4 x half> [[TMP8]]
 ;
 entry:
   %a0 = extractelement <4 x half> %in1, i64 0
@@ -49,20 +49,20 @@
 define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
 ; CHECK-LABEL: @phis_reverse(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32>
 ; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK: bb0:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32>
 ; CHECK-NEXT: br label [[BB1]]
 ; CHECK: bb1:
 ; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
 ; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32>
-; CHECK-NEXT: [[O31:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32>
-; CHECK-NEXT: ret <4 x half> [[O31]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32>
+; CHECK-NEXT: ret <4 x half> [[TMP8]]
 ;
 entry:
   %a0 = extractelement <4 x half> %in1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
--- a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
@@ -416,7 +416,7 @@
 
 define i16 @reduceshuffle_twoin_uneven_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_uneven_v16i16(
-; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
+; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
 ; CHECK-NEXT: [[X:%.*]] = xor <16 x i16> [[S]],
 ; CHECK-NEXT: [[R:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[X]])
 ; CHECK-NEXT: ret i16 [[R]]