diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -787,11 +787,43 @@ return OpCost; } + TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, + ArrayRef<int> Mask) const { + int Limit = Mask.size() * 2; + if (Mask.empty() || + // Extra check required by isSingleSourceMaskImpl function (called by + // ShuffleVectorInst::isSingleSourceMask). + any_of(Mask, [Limit](int I) { return I >= Limit; })) + return Kind; + switch (Kind) { + case TTI::SK_PermuteSingleSrc: + if (ShuffleVectorInst::isReverseMask(Mask)) + return TTI::SK_Reverse; + if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) + return TTI::SK_Broadcast; + break; + case TTI::SK_PermuteTwoSrc: + if (ShuffleVectorInst::isSelectMask(Mask)) + return TTI::SK_Select; + if (ShuffleVectorInst::isTransposeMask(Mask)) + return TTI::SK_Transpose; + break; + case TTI::SK_Select: + case TTI::SK_Reverse: + case TTI::SK_Broadcast: + case TTI::SK_Transpose: + case TTI::SK_InsertSubvector: + case TTI::SK_ExtractSubvector: + break; + } + return Kind; + } + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, int Index, VectorType *SubTp) { - switch (Kind) { + switch (improveShuffleKindFromMask(Kind, Mask)) { case TTI::SK_Broadcast: return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp)); case TTI::SK_Select: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1620,6 +1620,7 @@ VectorType *Tp, ArrayRef<int> Mask, int Index, VectorType *SubTp) { + Kind = improveShuffleKindFromMask(Kind, Mask); if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_Reverse) { diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1138,6 +1138,7 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, ArrayRef<int> Mask, int Index, VectorType *SubTp) { + Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVOP3PInsts()) { if (cast<FixedVectorType>(VT)->getNumElements() == 2 && DL.getTypeSizeInBits(VT->getElementType()) == 16) { diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1139,6 +1139,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, int Index, VectorType *SubTp) { + Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasNEON()) { if (Kind == TTI::SK_Broadcast) { static const CostTblEntry NEONDupTbl[] = { diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -559,6 +559,7 @@ VectorType *Tp, ArrayRef<int> Mask, int Index, VectorType *SubTp) { + Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVector()) { unsigned NumVectors = getNumVectorRegs(Tp); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -968,6 +968,7 @@ // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp); + Kind = improveShuffleKindFromMask(Kind, Mask); // Treat Transpose as 2-op shuffles - there's no difference in lowering. 
if (Kind == TTI::SK_Transpose) Kind = TTI::SK_PermuteTwoSrc; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4352,22 +4352,14 @@ } int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V); Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane; - // Extra check required by isSingleSourceMaskImpl function (called by - // ShuffleVectorInst::isSingleSourceMask). - if (Mask[I] >= 2 * E) - return None; } - if (Entries.size() == 1) { - if (ShuffleVectorInst::isReverseMask(Mask)) - return TargetTransformInfo::SK_Reverse; + switch (Entries.size()) { + case 1: return TargetTransformInfo::SK_PermuteSingleSrc; - } - if (Entries.size() == 2) { - if (ShuffleVectorInst::isSelectMask(Mask)) - return TargetTransformInfo::SK_Select; - if (ShuffleVectorInst::isTransposeMask(Mask)) - return TargetTransformInfo::SK_Transpose; + case 2: return TargetTransformInfo::SK_PermuteTwoSrc; + default: + break; } return None; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -45,7 +45,7 @@ ; YAML-NEXT: Function: fextr ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-1' +; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4'