diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -631,27 +631,26 @@ /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl &Order) { const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector MaskedIndices; + SmallBitVector UnusedIndices(Sz, /*t=*/true); + SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) - UsedIndices.set(Order[I]); + UnusedIndices.reset(Order[I]); else - MaskedIndices.push_back(I); + MaskedIndices.set(I); } - if (MaskedIndices.empty()) + if (MaskedIndices.none()) return; - SmallVector AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; + assert(UnusedIndices.count() == MaskedIndices.count() && + "Non-synced masked/available indices."); + int Idx = UnusedIndices.find_first(); + int MIdx = MaskedIndices.find_first(); + while (MIdx >= 0) { + assert(Idx >= 0 && "Indices must be synced."); + Order[MIdx] = Idx; + Idx = UnusedIndices.find_next(Idx); + MIdx = MaskedIndices.find_next(MIdx); + } } namespace llvm { @@ -2841,17 +2840,31 @@ return; } if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa(cast(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. + if (((TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle()) || + (all_of(TE->Scalars, + [](Value *V) { + return isa(V); + }) && + any_of(TE->Scalars, + [](Value *V) { return isa(V); }))) && + all_of(TE->Scalars, + [](Value *V) { + auto *EE = dyn_cast(V); + return !EE || + isa(EE->getVectorOperandType()); + }) && + allSameType(TE->Scalars)) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); if (Reuse || !CurrentOrder.empty()) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); GathersToOrders.try_emplace(TE.get(), CurrentOrder); return; @@ -3004,11 +3017,23 @@ return; } if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa(cast(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. + if (((TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle()) || + (all_of(TE->Scalars, + [](Value *V) { + return isa(V); + }) && + any_of(TE->Scalars, + [](Value *V) { return isa(V); }))) && + all_of(TE->Scalars, + [](Value *V) { + auto *EE = dyn_cast(V); + return !EE || + isa(EE->getVectorOperandType()); + }) && + allSameType(TE->Scalars)) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector with a single user only. OrdersType CurrentOrder; @@ -3021,6 +3046,8 @@ Entry.get() != TE.get() && Entry->isSame(TE->Scalars); })) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); OrderedEntries.insert(TE.get()); GathersToOrders.try_emplace(TE.get(), CurrentOrder); return; @@ -4219,10 +4246,17 @@ bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder) const { - Instruction *E0 = cast(OpValue); - assert(E0->getOpcode() == Instruction::ExtractElement || - E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + const auto *It = find_if(VL, [](Value *V) { + return isa(V); + }); + assert(It != VL.end() && "Expected at least one extract instruction."); + auto *E0 = cast(*It); + assert(all_of(VL, + [](Value *V) { + return isa( + V); + }) && + "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -4255,23 +4289,28 @@ // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. - CurrentOrder.assign(E, E + 1); + CurrentOrder.assign(E, E); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast(VL[I]); + auto *Inst = dyn_cast(VL[I]); + if (!Inst) + continue; if (Inst->getOperand(0) != Vec) break; + if (auto *EE = dyn_cast(Inst)) + if (isa(EE->getIndexOperand())) + continue; Optional Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { - if (CurrentOrder[I] != E + 1) + if (CurrentOrder[I] != E) break; CurrentOrder[I] = I; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll @@ -8,11 +8,11 @@ ; CHECK: body: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 ; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] ; CHECK: exit: