diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4287,8 +4287,8 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || - llvm::all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + all_of(I->users(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); }); } @@ -4442,9 +4442,9 @@ // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals](InstructionCost &Cost, - bool IsGather) { + VectorizedVals, E](InstructionCost &Cost) { DenseMap ExtractVectorsTys; + SmallPtrSet CheckedExtracts; for (auto *V : VL) { if (isa(V)) continue; @@ -4452,7 +4452,12 @@ // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast(V), VectorizedVals)) + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. + const TreeEntry *VE = getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !areAllUsersVectorized(cast(V), VectorizedVals) || + (VE && VE != E)) continue; auto *EE = cast(V); Optional EEIdx = getExtractIndex(EE); @@ -4549,11 +4554,6 @@ } return GatherCost; } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } if ((E->getOpcode() == Instruction::ExtractElement || all_of(E->Scalars, [](Value *V) { @@ -4571,13 +4571,18 @@ // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost, /*IsGather=*/true); + AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return Cost; } } + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( @@ -4755,7 +4760,7 @@ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } else { - AdjustExtractsCost(CommonCost, /*IsGather=*/false); + AdjustExtractsCost(CommonCost); } return CommonCost; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll @@ -2,24 +2,25 @@ ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -pass-remarks-output=%t | FileCheck %s ; RUN: FileCheck %s --input-file=%t --check-prefix=YAML -; YAML: --- !Missed +; YAML: --- !Passed ; YAML: Pass: slp-vectorizer -; YAML: Name: NotBeneficial +; YAML: Name: VectorizedList ; YAML: Function: multi_uses ; YAML: Args: -; YAML: - String: 'List vectorization was possible but not beneficial with cost ' -; YAML: - Cost: '0' -; YAML: - String: ' >= ' -; YAML: - Treshold: '0' +; YAML: - String: 'SLP vectorized with cost ' +; YAML: - Cost: '-1' +; YAML: - String: ' and with tree size ' +; YAML: - TreeSize: '3' define float @multi_uses(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @multi_uses( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1 ; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x float> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[Y1]] -; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[Y1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Y1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Y1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0