diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -631,27 +631,26 @@
 /// after: 6 3 5 4 7 2 1 0
 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
   const unsigned Sz = Order.size();
-  SmallBitVector UsedIndices(Sz);
-  SmallVector<int> MaskedIndices;
+  SmallBitVector UnusedIndices(Sz, /*t=*/true);
+  SmallBitVector MaskedIndices(Sz);
   for (unsigned I = 0; I < Sz; ++I) {
     if (Order[I] < Sz)
-      UsedIndices.set(Order[I]);
+      UnusedIndices.reset(Order[I]);
     else
-      MaskedIndices.push_back(I);
+      MaskedIndices.set(I);
   }
-  if (MaskedIndices.empty())
+  if (MaskedIndices.none())
     return;
-  SmallVector<int> AvailableIndices(MaskedIndices.size());
-  unsigned Cnt = 0;
-  int Idx = UsedIndices.find_first();
-  do {
-    AvailableIndices[Cnt] = Idx;
-    Idx = UsedIndices.find_next(Idx);
-    ++Cnt;
-  } while (Idx > 0);
-  assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
-  for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
-    Order[MaskedIndices[I]] = AvailableIndices[I];
+  assert(UnusedIndices.count() == MaskedIndices.count() &&
+         "Non-synced masked/available indices.");
+  int Idx = UnusedIndices.find_first();
+  int MIdx = MaskedIndices.find_first();
+  while (MIdx >= 0) {
+    assert(Idx >= 0 && "Indices must be synced.");
+    Order[MIdx] = Idx;
+    Idx = UnusedIndices.find_next(Idx);
+    MIdx = MaskedIndices.find_next(MIdx);
+  }
 }
 
 namespace llvm {
@@ -812,6 +811,13 @@
   /// ExtractElement, ExtractValue), which can be part of the graph.
   Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
 
+  /// Gets reordering data for the given tree entry. If the entry is vectorized
+  /// - just return ReorderIndices, otherwise check if the scalars can be
+  /// reordered and return the most optimal order.
+  /// \param TopToBottom If true, include the order of vectorized stores and
+  /// insertelement nodes, otherwise skip them.
+  Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -2819,6 +2825,50 @@
   return None;
 }
 
+Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
+                                                         bool TopToBottom) {
+  // No need to reorder if need to shuffle reuses, still need to shuffle the
+  // node.
+  if (!TE.ReuseShuffleIndices.empty())
+    return None;
+  if (TE.State == TreeEntry::Vectorize &&
+      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
+      !TE.isAltShuffle())
+    return TE.ReorderIndices;
+  if (TE.State == TreeEntry::NeedToGather) {
+    // TODO: add analysis of other gather nodes with extractelement
+    // instructions and other values/instructions, not only undefs.
+    if (((TE.getOpcode() == Instruction::ExtractElement &&
+          !TE.isAltShuffle()) ||
+         (all_of(TE.Scalars,
+                 [](Value *V) {
+                   return isa<UndefValue, ExtractElementInst>(V);
+                 }) &&
+          any_of(TE.Scalars,
+                 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
+        all_of(TE.Scalars,
+               [](Value *V) {
+                 auto *EE = dyn_cast<ExtractElementInst>(V);
+                 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
+               }) &&
+        allSameType(TE.Scalars)) {
+      // Check that gather of extractelements can be represented as
+      // just a shuffle of a single vector.
+      OrdersType CurrentOrder;
+      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
+      if (Reuse || !CurrentOrder.empty()) {
+        if (!CurrentOrder.empty())
+          fixupOrderingIndices(CurrentOrder);
+        return CurrentOrder;
+      }
+    }
+    if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+      return CurrentOrder;
+  }
+  return None;
+}
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2829,39 +2879,11 @@
   // Currently the are vectorized loads,extracts + some gathering of extracts.
   for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
                                  const std::unique_ptr<TreeEntry> &TE) {
-    // No need to reorder if need to shuffle reuses, still need to shuffle the
-    // node.
-    if (!TE->ReuseShuffleIndices.empty())
-      return;
-    if (TE->State == TreeEntry::Vectorize &&
-        isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
-            InsertElementInst>(TE->getMainOp()) &&
-        !TE->isAltShuffle()) {
+    if (Optional<OrdersType> CurrentOrder =
+            getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
       VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-      return;
-    }
-    if (TE->State == TreeEntry::NeedToGather) {
-      if (TE->getOpcode() == Instruction::ExtractElement &&
-          !TE->isAltShuffle() &&
-          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                   ->getVectorOperandType()) &&
-          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-        // Check that gather of extractelements can be represented as
-        // just a shuffle of a single vector.
-        OrdersType CurrentOrder;
-        bool Reuse =
-            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-        if (Reuse || !CurrentOrder.empty()) {
-          VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
-          return;
-        }
-      }
-      if (Optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(*TE.get())) {
-        VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize)
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      }
     }
   });
 
@@ -2993,44 +3015,11 @@
                                  const std::unique_ptr<TreeEntry> &TE) {
     if (TE->State != TreeEntry::Vectorize)
       NonVectorized.push_back(TE.get());
-    // No need to reorder if need to shuffle reuses, still need to shuffle the
-    // node.
-    if (!TE->ReuseShuffleIndices.empty())
-      return;
-    if (TE->State == TreeEntry::Vectorize &&
-        isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
-        !TE->isAltShuffle()) {
+    if (Optional<OrdersType> CurrentOrder =
+            getReorderingData(*TE.get(), /*TopToBottom=*/false)) {
       OrderedEntries.insert(TE.get());
-      return;
-    }
-    if (TE->State == TreeEntry::NeedToGather) {
-      if (TE->getOpcode() == Instruction::ExtractElement &&
-          !TE->isAltShuffle() &&
-          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                   ->getVectorOperandType()) &&
-          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-        // Check that gather of extractelements can be represented as
-        // just a shuffle of a single vector with a single user only.
-        OrdersType CurrentOrder;
-        bool Reuse =
-            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-        if ((Reuse || !CurrentOrder.empty()) &&
-            !any_of(VectorizableTree,
-                    [&TE](const std::unique_ptr<TreeEntry> &Entry) {
-                      return Entry->State == TreeEntry::NeedToGather &&
-                             Entry.get() != TE.get() &&
-                             Entry->isSame(TE->Scalars);
-                    })) {
-          OrderedEntries.insert(TE.get());
-          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
-          return;
-        }
-      }
-      if (Optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(*TE.get())) {
-        OrderedEntries.insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize)
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      }
     }
   });
 
@@ -4219,10 +4208,17 @@
 
 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                               SmallVectorImpl<unsigned> &CurrentOrder) const {
-  Instruction *E0 = cast<Instruction>(OpValue);
-  assert(E0->getOpcode() == Instruction::ExtractElement ||
-         E0->getOpcode() == Instruction::ExtractValue);
-  assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+  const auto *It = find_if(VL, [](Value *V) {
+    return isa<ExtractElementInst, ExtractValueInst>(V);
+  });
+  assert(It != VL.end() && "Expected at least one extract instruction.");
+  auto *E0 = cast<Instruction>(*It);
+  assert(all_of(VL,
+                [](Value *V) {
+                  return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
+                      V);
+                }) &&
+         "Invalid opcode");
   // Check if all of the extracts come from the same vector and from the
   // correct offset.
   Value *Vec = E0->getOperand(0);
@@ -4255,23 +4251,28 @@
   // Also, later we can check that all the indices are used and we have a
   // consecutive access in the extract instructions, by checking that no
   // element of CurrentOrder still has value E + 1.
-  CurrentOrder.assign(E, E + 1);
+  CurrentOrder.assign(E, E);
   unsigned I = 0;
   for (; I < E; ++I) {
-    auto *Inst = cast<Instruction>(VL[I]);
+    auto *Inst = dyn_cast<Instruction>(VL[I]);
+    if (!Inst)
+      continue;
     if (Inst->getOperand(0) != Vec)
       break;
+    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
+      if (isa<UndefValue>(EE->getIndexOperand()))
+        continue;
     Optional<unsigned> Idx = getExtractIndex(Inst);
     if (!Idx)
      break;
     const unsigned ExtIdx = *Idx;
     if (ExtIdx != I) {
-      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)
        break;
       ShouldKeepOrder = false;
       CurrentOrder[ExtIdx] = I;
     } else {
-      if (CurrentOrder[I] != E + 1)
+      if (CurrentOrder[I] != E)
        break;
       CurrentOrder[I] = I;
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
@@ -8,11 +8,11 @@
 ; CHECK:       body:
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ADD8_I_I:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[ADD8_I_I:%.*]] = fadd fast double [[TMP5]], [[TMP4]]
 ; CHECK-NEXT:    [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
 ; CHECK-NEXT:    br i1 false, label [[BODY]], label [[EXIT:%.*]]
 ; CHECK:       exit:
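Note on the new `fixupOrderingIndices` logic in the first hunk: the order vector may contain placeholder entries (values >= size) for scalars whose position is not fixed, and the function rewrites each placeholder to one of the still-unused indices so the result is a real permutation. A minimal standalone sketch of that remapping is below; it is not the patched LLVM function itself, and it substitutes `std::vector` for `SmallBitVector`/`SmallVectorImpl` so it compiles on its own. The name `fixupOrderingIndicesSketch` is illustrative only.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Standalone sketch of the bit-vector based fix-up (std::vector<bool> stands
// in for llvm::SmallBitVector). Entries in Order that are >= Order.size() are
// "masked" placeholders; each one is rewritten to the next still-unused index,
// in ascending order, so Order becomes a permutation of 0..Size-1.
static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = static_cast<unsigned>(Order.size());
  std::vector<bool> Unused(Sz, true);  // indices not yet claimed by Order
  std::vector<bool> Masked(Sz, false); // positions holding a placeholder
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Unused[Order[I]] = false;
    else
      Masked[I] = true;
  }
  // Walk both sets in parallel: every masked position receives the next
  // unused index. For a valid input the two sets have the same size.
  unsigned Idx = 0;
  for (unsigned MIdx = 0; MIdx < Sz; ++MIdx) {
    if (!Masked[MIdx])
      continue;
    while (Idx < Sz && !Unused[Idx])
      ++Idx;
    assert(Idx < Sz && "Indices must be synced.");
    Order[MIdx] = Idx++;
  }
}

int main() {
  // Mirrors the example from the function's doc comment: placeholders (9,
  // i.e. >= size) at positions 1 and 4 become the unused indices 3 and 7.
  std::vector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0};
  fixupOrderingIndicesSketch(Order);
  for (unsigned V : Order)
    std::printf("%u ", V); // prints: 6 3 5 4 7 2 1 0
  std::printf("\n");
  return 0;
}
```

This is the same pairing the patch performs with `UnusedIndices.find_first()`/`find_next()` and `MaskedIndices.find_first()`/`find_next()`, just written with plain loops; the earlier code built a separate `AvailableIndices` array, which the patch drops in favor of walking the two bit vectors in lockstep.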