diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -946,9 +946,14 @@ } /// Shuffles \p Mask in accordance with the given \p SubMask. -static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask) { +/// \param ExtendingManyInputs Supports reshuffling of the mask with not only +/// one but two input vectors. +static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask, + bool ExtendingManyInputs = false) { if (SubMask.empty()) return; + assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) && + "SubMask with many inputs support must be larger than the mask."); if (Mask.empty()) { Mask.append(SubMask.begin(), SubMask.end()); return; @@ -956,8 +961,9 @@ SmallVector NewMask(SubMask.size(), UndefMaskElem); int TermValue = std::min(Mask.size(), SubMask.size()); for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) + if ((!ExtendingManyInputs && + (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) || + SubMask[I] == UndefMaskElem) continue; NewMask[I] = Mask[SubMask[I]]; } @@ -6788,6 +6794,8 @@ /// analysis/transformations. class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool IsFinalized = false; + SmallVector CommonMask; + SmallVector InVectors; const TargetTransformInfo &TTI; InstructionCost Cost = 0; ArrayRef VectorizedVals; @@ -7009,19 +7017,53 @@ VecTy, std::nullopt, CostKind, 0, EEVTy); } } + InVectors.assign(1, VecBase); return VecBase; } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign( + 2, Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), + std::max(E1->getVectorFactor(), E2->getVectorFactor())))); + } + void add(const TreeEntry *E1, ArrayRef Mask) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign( + 1, Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor()))); + } void gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); + if (!Root) { + assert(InVectors.empty() && "Unexpected input vectors for buildvector."); + InVectors.assign(1, Constant::getNullValue(FixedVectorType::get( + VL.front()->getType(), VL.size()))); + } } /// Finalize emission of the shuffles. - InstructionCost finalize() { + InstructionCost finalize(ArrayRef ExtMask) { IsFinalized = true; - return Cost; + ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); + if (CommonMask.empty()) + return Cost; + int Limit = CommonMask.size() * 2; + if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(CommonMask)) + return Cost; + return Cost + + TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc + : TTI::SK_PermuteSingleSrc, + FixedVectorType::get( + cast(InVectors.front()->getType()) + ->getElementType(), + CommonMask.size()), + CommonMask); } ~ShuffleCostEstimator() { - assert(IsFinalized && "Shuffle construction must be finalized."); + assert((IsFinalized || CommonMask.empty()) && + "Shuffle construction must be finalized."); } }; @@ -7109,35 +7151,30 @@ if (Mask[I] != UndefMaskElem) GatheredScalars[I] = PoisonValue::get(ScalarTy); } - InstructionCost GatherCost = 0; - int Limit = Mask.size() * 2; - if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - if (NeedToShuffleReuses) - GatherCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - } else { - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n"); - // Detected that instead of gather we can emit a shuffle of single/two - // previously vectorized nodes. Add the cost of the permutation rather - // than gather. - ::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask); - } + LLVM_DEBUG( + int Limit = Mask.size() * 2; + if (*GatherShuffle == TTI::SK_PermuteSingleSrc && + all_of(Mask, [=](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask)) { + // Perfect match in the graph, will reuse the previously + // vectorized node. Cost is 0. + dbgs() << "SLP: perfect diamond match for gather bundle " + "that starts with " + << *VL.front() << ".\n"; + } else { + dbgs() << "SLP: shuffled " << Entries.size() + << " entries for bundle that starts with " << *VL.front() + << ".\n"; + }); + if (Entries.size() == 1) + Estimator.add(Entries.front(), Mask); + else + Estimator.add(Entries.front(), Entries.back(), Mask); Estimator.gather( GatheredScalars, Constant::getNullValue(FixedVectorType::get( GatheredScalars.front()->getType(), GatheredScalars.size()))); - return GatherCost + Estimator.finalize(); + return Estimator.finalize(E->ReuseShuffleIndices); } if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) { // Check that gather of extractelements can be represented as just a @@ -7147,17 +7184,15 @@ // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); - if (NeedToShuffleReuses) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - return Cost + Estimator.finalize(); - } - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) - ReuseShuffleCost = TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - Estimator.gather(GatheredScalars); - return ReuseShuffleCost + Estimator.finalize(); + return Cost + Estimator.finalize(E->ReuseShuffleIndices); + } + Estimator.gather( + GatheredScalars, + (ExtractShuffle || GatherShuffle) + ? Constant::getNullValue(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size())) + : nullptr); + return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; SmallVector Mask; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -93,20 +93,36 @@ } define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_diff_preds( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 -; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false -; CHECK-NEXT: ret i1 [[S3]] +; SSE-LABEL: @logical_and_icmp_diff_preds( +; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]] +; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false +; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; SSE-NEXT: ret i1 [[S3]] +; +; AVX-LABEL: @logical_and_icmp_diff_preds( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> , <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 +; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false +; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false +; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false +; AVX-NEXT: ret i1 [[S3]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1