diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -12270,138 +12270,140 @@ BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - int E = Stores.size(); - SmallBitVector Tails(E, false); - int MaxIter = MaxStoreLookup.getValue(); - SmallVector, 16> ConsecutiveChain( - E, std::make_pair(E, INT_MAX)); - SmallVector CheckedPairs(E, SmallBitVector(E, false)); - int IterCnt; - auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, - &CheckedPairs, - &ConsecutiveChain](int K, int Idx) { - if (IterCnt >= MaxIter) - return true; - if (CheckedPairs[Idx].test(K)) - return ConsecutiveChain[K].second == 1 && - ConsecutiveChain[K].first == Idx; - ++IterCnt; - CheckedPairs[Idx].set(K); - CheckedPairs[K].set(Idx); - std::optional Diff = getPointersDiff( - Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), - Stores[Idx]->getValueOperand()->getType(), - Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - if (!Diff || *Diff == 0) - return false; - int Val = *Diff; - if (Val < 0) { - if (ConsecutiveChain[Idx].second > -Val) { - Tails.set(K); - ConsecutiveChain[Idx] = std::make_pair(K, -Val); + DenseSet> TriedSequences; + auto TryToVectorize = [&](const auto &Set) { + int PrevDist = -1; + BoUpSLP::ValueList Operands; + // Collect the chain into a list. + for (auto [Idx, Data] : enumerate(Set)) { + if (Operands.empty() || Data.second - PrevDist == 1) { + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + if (Idx != Set.size() - 1) + continue; + } + if (Operands.size() <= 1) { + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; + continue; } - return false; - } - if (ConsecutiveChain[K].second <= Val) - return false; - Tails.set(Idx); - ConsecutiveChain[K] = std::make_pair(Idx, Val); - return Val == 1; - }; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - for (int Idx = E - 1; Idx >= 0; --Idx) { - // If a store has multiple consecutive store candidates, search according - // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. - const int MaxLookDepth = std::max(E - Idx, Idx + 1); - IterCnt = 0; - for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) - if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || - (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) - break; - } + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Operands[0]); + unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); - // Tracks if we tried to vectorize stores starting from the given tail - // already. - SmallBitVector TriedTails(E, false); - // For stores that start but don't end a link in the chain: - for (int Cnt = E; Cnt > 0; --Cnt) { - int I = Cnt - 1; - if (ConsecutiveChain[I].first == E || Tails.test(I)) - continue; - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - BoUpSLP::ValueList Operands; - // Collect the chain into a list. - while (I != E && !VectorizedStores.count(Stores[I])) { - Operands.push_back(Stores[I]); - Tails.set(I); - if (ConsecutiveChain[I].second != 1) { - // Mark the new end in the chain and go back, if required. It might be - // required if the original stores come in reversed order, for example. - if (ConsecutiveChain[I].first != E && - Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && - !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { - TriedTails.set(I); - Tails.reset(ConsecutiveChain[I].first); - if (Cnt < ConsecutiveChain[I].first + 2) - Cnt = ConsecutiveChain[I].first + 2; + unsigned MaxVF = + std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); + auto *Store = cast(Operands[0]); + Type *StoreTy = Store->getValueOperand()->getType(); + Type *ValueTy = StoreTy; + if (auto *Trunc = dyn_cast(Store->getValueOperand())) + ValueTy = Trunc->getSrcTy(); + unsigned MinVF = TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); + + if (MaxVF <= MinVF) { + LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF + << ") <= " + << "MinVF (" << MinVF << ")\n"); + } + + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + unsigned StartIdx = 0; + for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } - break; + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } - // Move to the next value in the chain. - I = ConsecutiveChain[I].first; + Operands.clear(); + Operands.push_back(Stores[Data.first]); + PrevDist = Data.second; } - assert(!Operands.empty() && "Expected non-empty list of stores."); + }; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); - unsigned EltSize = R.getVectorElementSize(Operands[0]); - unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize); - - unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), - MaxElts); - auto *Store = cast(Operands[0]); - Type *StoreTy = Store->getValueOperand()->getType(); - Type *ValueTy = StoreTy; - if (auto *Trunc = dyn_cast(Store->getValueOperand())) - ValueTy = Trunc->getSrcTy(); - unsigned MinVF = TTI->getStoreMinimumVF( - R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); - - if (MaxVF <= MinVF) { - LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " - << "MinVF (" << MinVF << ")\n"); - } - - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? - unsigned StartIdx = 0; - for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; - } - ++Cnt; + struct StoreDistCompare { + bool operator()(const std::pair &Op1, + const std::pair &Op2) const { + return Op1.second < Op2.second; + } + }; + SmallVector< + std::pair, StoreDistCompare>>> + SortedStores; + auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { + for (auto &Set : SortedStores) { + std::optional Diff = getPointersDiff( + Stores[Set.first]->getValueOperand()->getType(), + Stores[Set.first]->getPointerOperand(), + SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + if (!Diff) + continue; + auto It = Set.second.find(std::make_pair(Idx, *Diff)); + if (It == Set.second.end()) { + Set.second.emplace(Idx, *Diff); + return; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) - break; + // Try to vectorize the first found set to avoid duplicate analysis. + TryToVectorize(Set.second); + decltype(Set.second) PrevSet; + PrevSet.swap(Set.second); + Set.first = Idx; + Set.second.emplace(Idx, 0); + // Insert stores that followed previous match to try to vectorize them + // with this store. + unsigned StartIdx = It->first + 1; + SmallBitVector UsedStores(Idx - StartIdx); + // Distances to previously found dup store (or this store, since they + // store to the same addresses). + SmallVector Dists(Idx - StartIdx, 0); + for (const std::pair &Pair : reverse(PrevSet)) { + // Do not try to vectorize sequences, we already tried. + if (Pair.first <= It->first || + VectorizedStores.contains(Stores[Pair.first])) + break; + unsigned BI = Pair.first - StartIdx; + UsedStores.set(BI); + Dists[BI] = Pair.second - It->second; + } + for (unsigned I = StartIdx; I < Idx; ++I) { + unsigned BI = I - StartIdx; + if (BI < UsedStores.size() && UsedStores.test(BI)) + Set.second.emplace(I, Dists[BI]); + } + return; } - } + auto &Res = SortedStores.emplace_back(); + Res.first = Idx; + Res.second.emplace(Idx, 0); + }; + for (auto [I, SI] : enumerate(Stores)) + FillStoresSet(I, SI); + + // Final vectorization attempt. + for (auto &Set : SortedStores) + TryToVectorize(Set.second); return Changed; } @@ -15035,8 +15037,13 @@ if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) continue; + // Reverse stores to do bottom-to-top analysis. This is important if the + // values are stores to the same addresses several times, in this case need + // to follow the stores order (reversed to meet the memory dependecies). + SmallVector ReversedStores(Pair.second.rbegin(), + Pair.second.rend()); Changed |= tryToVectorizeSequence( - Pair.second, StoreSorter, AreCompatibleStores, + ReversedStores, StoreSorter, AreCompatibleStores, [this, &R](ArrayRef Candidates, bool) { return vectorizeStores(Candidates, R); }, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/many_stores.ll @@ -5,10 +5,6 @@ ; CHECK-LABEL: define i32 @test ; CHECK-SAME: (ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[IDX2:%.*]] = getelementptr i32, ptr [[P]], i64 4 -; CHECK-NEXT: store i32 0, ptr [[IDX2]], align 4 -; CHECK-NEXT: [[IDX3:%.*]] = getelementptr i32, ptr [[P]], i64 6 -; CHECK-NEXT: store i32 0, ptr [[IDX3]], align 4 ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr i32, ptr [[P]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[IDX4]], align 4 ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr i32, ptr [[P]], i64 10 @@ -69,9 +65,7 @@ ; CHECK-NEXT: store i32 0, ptr [[IDX33]], align 4 ; CHECK-NEXT: store i32 0, ptr [[P]], align 4 ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr i32, ptr [[P]], i64 3 -; CHECK-NEXT: store i32 0, ptr [[IDX0]], align 4 -; CHECK-NEXT: [[IDX1:%.*]] = getelementptr i32, ptr [[P]], i64 5 -; CHECK-NEXT: store i32 0, ptr [[IDX1]], align 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[IDX0]], align 4 ; CHECK-NEXT: ret i32 0 ; entry: