diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9404,8 +9404,11 @@ continue; auto *EI = cast(E->Scalars[I]); VecBase = EI->getVectorOperand(); - // TODO: EI can be erased, if all its users are vectorized. But need to - // emit shuffles for such extractelement instructions. + // If all users are vectorized - can delete the extractelement itself. + if (any_of(EI->users(), + [&](User *U) { return !ScalarToTreeEntry.count(U); })) + continue; + eraseInstruction(EI); } return VecBase; }; @@ -9464,7 +9467,6 @@ Value *Vec = nullptr; SmallVector Mask; SmallVector ExtractMask; - SmallVector ReuseMask; std::optional ExtractShuffle; std::optional GatherShuffle; SmallVector Entries; @@ -9515,6 +9517,95 @@ } } } + auto TryPackScalars = [&](SmallVectorImpl &Scalars, + SmallVectorImpl &ReuseMask, + bool IsRootPoison) { + // For splats with can emit broadcasts instead of gathers, so try to find + // such sequences. + bool IsSplat = IsRootPoison && isSplat(Scalars) && + (Scalars.size() > 2 || Scalars.front() == Scalars.back()); + Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy)); + SmallVector UndefPos; + DenseMap UniquePositions; + // Gather unique non-const values and all constant values. + // For repeated values, just shuffle them. + int NumNonConsts = 0; + int SinglePos = 0; + for (auto [I, V] : enumerate(Scalars)) { + if (isa(V)) { + if (!isa(V)) { + ReuseMask[I] = I; + UndefPos.push_back(I); + } + continue; + } + if (isConstant(V)) { + ReuseMask[I] = I; + continue; + } + ++NumNonConsts; + SinglePos = I; + Value *OrigV = V; + Scalars[I] = PoisonValue::get(ScalarTy); + if (IsSplat) { + Scalars.front() = OrigV; + ReuseMask[I] = 0; + } else { + const auto Res = UniquePositions.try_emplace(OrigV, I); + Scalars[Res.first->second] = OrigV; + ReuseMask[I] = Res.first->second; + } + } + if (NumNonConsts == 1) { + // Restore single insert element. + if (IsSplat) { + ReuseMask.assign(VF, UndefMaskElem); + std::swap(Scalars.front(), Scalars[SinglePos]); + if (!UndefPos.empty() && UndefPos.front() == 0) + Scalars.front() = UndefValue::get(ScalarTy); + } + ReuseMask[SinglePos] = SinglePos; + } else if (!UndefPos.empty() && IsSplat) { + // For undef values, try to replace them with the simple broadcast. + // We can do it if the broadcasted value is guaranteed to be + // non-poisonous, or by freezing the incoming scalar value first. + auto *It = find_if(Scalars, [this, E](Value *V) { + return !isa(V) && + (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || + (E->UserTreeIndices.size() == 1 && + any_of(V->uses(), [E](const Use &U) { + // Check if the value already used in the same operation in + // one of the nodes already. + return E->UserTreeIndices.front().EdgeIdx != + U.getOperandNo() && + is_contained( + E->UserTreeIndices.front().UserTE->Scalars, + U.getUser()); + }))); + }); + if (It != Scalars.end()) { + // Replace undefs by the non-poisoned scalars and emit broadcast. + int Pos = std::distance(Scalars.begin(), It); + for_each(UndefPos, [&](int I) { + // Set the undef position to the non-poisoned scalar. + ReuseMask[I] = Pos; + // Replace the undef by the poison, in the mask it is replaced by + // non-poisoned scalar already. + if (I != Pos) + Scalars[I] = PoisonValue::get(ScalarTy); + }); + } else { + // Replace undefs by the poisons, emit broadcast and then emit + // freeze. + for_each(UndefPos, [&](int I) { + ReuseMask[I] = UndefMaskElem; + if (isa(Scalars[I])) + Scalars[I] = PoisonValue::get(ScalarTy); + }); + NeedFreeze = true; + } + } + }; if (ExtractShuffle || GatherShuffle) { bool IsNonPoisoned = true; bool IsUsedInExpr = false; @@ -9542,6 +9633,8 @@ } } if (Vec2) { + IsNonPoisoned &= + isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { ShuffleBuilder.add(Vec1, ExtractMask); @@ -9562,6 +9655,9 @@ } else { ShuffleBuilder.add(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && + isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); } } // Try to figure out best way to combine values: build a shuffle and insert @@ -9610,11 +9706,8 @@ // Generate constants for final shuffle and build a mask for them. if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector BVMask(GatheredScalars.size(), UndefMaskElem); + TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); Value *BV = gather(GatheredScalars); - for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { - if (!isa(GatheredScalars[I])) - BVMask[I] = I; - } ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -9627,111 +9720,25 @@ Vec = ShuffleBuilder.finalize( E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { + TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); Vec = gather(NonConstants, Vec); - for (unsigned I = 0, Sz = Mask.size(); I < Sz; ++I) - if (!isa(NonConstants[I])) - Mask[I] = I; }); - } else if (!allConstant(E->Scalars)) { - // TODO: remove this code once able to combine shuffled vectors and build - // vector elements. - copy(E->Scalars, GatheredScalars.begin()); - // For splats with can emit broadcasts instead of gathers, so try to find - // such sequences. - bool IsSplat = isSplat(GatheredScalars) && - (GatheredScalars.size() > 2 || - GatheredScalars.front() == GatheredScalars.back()); - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - ReuseMask.assign(VF, UndefMaskElem); - SmallVector UndefPos; - DenseMap UniquePositions; - // Gather unique non-const values and all constant values. - // For repeated values, just shuffle them. - int NumNonConsts = 0; - int SinglePos = 0; - for (auto [I, V] : enumerate(GatheredScalars)) { - if (isa(V)) { - if (!isa(V)) { - ReuseMask[I] = I; - UndefPos.push_back(I); - } - continue; - } - if (isConstant(V)) { - ReuseMask[I] = I; - continue; - } - ++NumNonConsts; - SinglePos = I; - Value *OrigV = V; - GatheredScalars[I] = PoisonValue::get(ScalarTy); - if (IsSplat) { - GatheredScalars.front() = OrigV; - ReuseMask[I] = 0; - } else { - const auto Res = UniquePositions.try_emplace(OrigV, I); - GatheredScalars[Res.first->second] = OrigV; - ReuseMask[I] = Res.first->second; - } - } - if (NumNonConsts == 1) { - // Restore single insert element. - if (IsSplat) { - ReuseMask.assign(VF, UndefMaskElem); - std::swap(GatheredScalars.front(), GatheredScalars[SinglePos]); - if (!UndefPos.empty() && UndefPos.front() == 0) - GatheredScalars.front() = UndefValue::get(ScalarTy); - } - ReuseMask[SinglePos] = SinglePos; - } else if (!UndefPos.empty() && IsSplat) { - // For undef values, try to replace them with the simple broadcast. - // We can do it if the broadcasted value is guaranteed to be - // non-poisonous, or by freezing the incoming scalar value first. - auto *It = find_if(GatheredScalars, [this, E](Value *V) { - return !isa(V) && - (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || - (E->UserTreeIndices.size() == 1 && - any_of(V->uses(), [E](const Use &U) { - // Check if the value already used in the same operation in - // one of the nodes already. - return E->UserTreeIndices.front().EdgeIdx != - U.getOperandNo() && - is_contained( - E->UserTreeIndices.front().UserTE->Scalars, - U.getUser()); - }))); - }); - if (It != GatheredScalars.end()) { - // Replace undefs by the non-poisoned scalars and emit broadcast. - int Pos = std::distance(GatheredScalars.begin(), It); - for_each(UndefPos, [&](int I) { - // Set the undef position to the non-poisoned scalar. - ReuseMask[I] = Pos; - // Replace the undef by the poison, in the mask it is replaced by - // non-poisoned scalar already. - if (I != Pos) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - }); - } else { - // Replace undefs by the poisons, emit broadcast and then emit - // freeze. - for_each(UndefPos, [&](int I) { - ReuseMask[I] = UndefMaskElem; - if (isa(GatheredScalars[I])) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - }); - NeedFreeze = true; - } - } + } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. + SmallVector ReuseMask(GatheredScalars.size(), UndefMaskElem); + TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); Vec = gather(GatheredScalars); ShuffleBuilder.add(Vec, ReuseMask); Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { // Gather all constants. + SmallVector Mask(E->Scalars.size(), UndefMaskElem); + for (auto [I, V] : enumerate(E->Scalars)) { + if (!isa(V)) + Mask[I] = I; + } Vec = gather(E->Scalars); - ShuffleBuilder.add(Vec, ReuseMask); + ShuffleBuilder.add(Vec, Mask); Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @a, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SUB102_3]], i32 12 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_3]], i32 15 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32>