Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -693,6 +693,13 @@ void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); + + /// Kind of the tree entry. + enum TreeEntryKind { + TEK_Vectorize, /// Entry must be vectorized. + TEK_Gather, /// Entry is the set of gather scalars. + TEK_EEShuffle, /// Entry is the shuffle of the ExtractElementInstructions. + }; struct TreeEntry { TreeEntry(std::vector &Container) : Container(Container) {} @@ -712,8 +719,9 @@ /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; - /// Do we need to gather this sequence ? - bool NeedToGather = false; + /// What should be done with the tree entry: vectorization, gathering or + /// shuffling of ExtractElement|InsertElements. + TreeEntryKind VectorizationKind = TEK_Vectorize; /// Does this sequence require some shuffling? SmallVector ReuseShuffleIndices; @@ -732,21 +740,21 @@ }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(ArrayRef VL, TreeEntryKind Kind, int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->VectorizationKind = Kind; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - if (Vectorized) { + if (Kind == TEK_Vectorize) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = idx; } - } else { + } else if (Kind == TEK_Gather) { MustGather.insert(VL.begin(), VL.end()); } @@ -1306,7 +1314,7 @@ static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->VectorizationKind != BoUpSLP::TEK_Vectorize) return "color=red"; return ""; } @@ -1334,7 +1342,7 @@ TreeEntry *Entry = &EIdx; // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->VectorizationKind != TEK_Vectorize) continue; // For each lane: @@ -1371,7 +1379,7 @@ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->VectorizationKind == TEK_Vectorize && "Bad state"); continue; } } @@ -1395,28 +1403,28 @@ InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } @@ -1428,7 +1436,7 @@ if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } } @@ -1438,7 +1446,7 @@ DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -1456,7 +1464,7 @@ if (getTreeEntry(I)) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } } @@ -1466,7 +1474,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } } @@ -1480,7 +1488,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } @@ -1500,7 +1508,7 @@ DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, TEK_Gather, UserTreeIdx); return; } VL = UniqueValues; @@ -1517,7 +1525,7 @@ assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1536,12 +1544,12 @@ if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1557,17 +1565,22 @@ } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, VL0); - if (Reuse) { + TreeEntryKind Kind = TEK_Gather; + if (canReuseExtract(VL, VL0)) { DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOrder[S.Opcode]; + Kind = TEK_Vectorize; } else { - SmallVector ReverseVL(VL.rbegin(), VL.rend()); - if (canReuseExtract(ReverseVL, VL0)) + if (canReuseExtract(llvm::to_vector<4>(llvm::reverse(VL)), VL0)) { --NumOpsWantToKeepOrder[S.Opcode]; + } else if (ShuffleOrOp == Instruction::ExtractElement && + isShuffle(VL).hasValue()) { + ++NumOpsWantToKeepOrder[S.Opcode]; + Kind = TEK_EEShuffle; + } BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, Kind, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::Load: { @@ -1582,7 +1595,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1593,7 +1606,7 @@ LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1615,7 +1628,7 @@ if (Consecutive) { ++NumOpsWantToKeepOrder[S.Opcode]; - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1630,7 +1643,7 @@ } BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); if (ReverseConsecutive) { --NumOpsWantToKeepOrder[S.Opcode]; @@ -1657,12 +1670,12 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1685,13 +1698,13 @@ if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1723,7 +1736,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1752,7 +1765,7 @@ if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1765,7 +1778,7 @@ if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1777,12 +1790,12 @@ DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1799,12 +1812,12 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1822,7 +1835,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1836,7 +1849,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1847,7 +1860,7 @@ Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1860,14 +1873,14 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1884,11 +1897,11 @@ // then do not vectorize this instruction. if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Vectorize, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1912,7 +1925,7 @@ default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, TEK_Gather, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2012,38 +2025,21 @@ ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->VectorizationKind == TEK_Gather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).Opcode == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); - if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - IO->getZExtValue()); - } - } - return ReuseShuffleCost + Cost; - } - } return ReuseShuffleCost + getGatherCost(VL); } InstructionsState S = getSameOpcode(VL); assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert((E->VectorizationKind == TEK_Vectorize || + (E->VectorizationKind == TEK_EEShuffle && + S.Opcode == Instruction::ExtractElement)) && + "Must be vectorization entry or ExtractElementInst shuffle."); Instruction *VL0 = cast(S.OpValue); unsigned ShuffleOrOp = S.IsAltShuffle ? (unsigned) Instruction::ShuffleVector : S.Opcode; @@ -2081,7 +2077,26 @@ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (canReuseExtract(VL, S.OpValue)) { + if (E->VectorizationKind == TEK_EEShuffle) { + Optional ShuffleKind = isShuffle(VL); + assert(ShuffleKind.hasValue() && "Must be shuffle of extractelements"); + int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + for (auto *V : VL) { + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of + // the vectorized tree. + if (areAllUsersVectorized(cast(V)) && + !ScalarToTreeEntry.count(V)) { + auto *IO = cast( + cast(V)->getIndexOperand()); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + IO->getZExtValue()); + } + } + return ReuseShuffleCost + Cost; + } + if (E->VectorizationKind == TEK_Vectorize) { int DeadCost = ReuseShuffleCost; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); @@ -2344,20 +2359,22 @@ VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0].VectorizationKind == TEK_Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0].NeedToGather && + if (VectorizableTree[0].VectorizationKind == TEK_Vectorize && (allConstant(VectorizableTree[1].Scalars) || isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + if (VectorizableTree[0].VectorizationKind != TEK_Vectorize || + VectorizableTree[1].VectorizationKind != TEK_Vectorize) return false; return true; @@ -2935,7 +2952,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->VectorizationKind == TEK_Gather) { setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -2949,6 +2966,10 @@ E->VectorizedValue = V; return V; } + assert((E->VectorizationKind == TEK_Vectorize || + (E->VectorizationKind == TEK_EEShuffle && + S.Opcode == Instruction::ExtractElement)) && + "Must be vectorization entry or ExtractElementInst shuffle."); unsigned ShuffleOrOp = S.IsAltShuffle ? (unsigned) Instruction::ShuffleVector : S.Opcode; @@ -2994,56 +3015,44 @@ } case Instruction::ExtractElement: { - if (canReuseExtract(E->Scalars, VL0)) { - Value *V = VL0->getOperand(0); - if (NeedToShuffleReuses) { - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; - return V; + Value *V; + if (E->VectorizationKind == TEK_Vectorize) { + V = VL0->getOperand(0); + } else { + assert(E->VectorizationKind == TEK_EEShuffle && + "Only shuffle is expected."); + setInsertPointAfterBundle(E->Scalars, VL0); + V = Gather(E->Scalars, VecTy); } - setInsertPointAfterBundle(E->Scalars, VL0); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (E->VectorizationKind != TEK_Vectorize) { + if (auto *I = dyn_cast(V)) { + GatherSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (canReuseExtract(E->Scalars, VL0)) { - LoadInst *LI = cast(VL0->getOperand(0)); - Builder.SetInsertPoint(LI); - PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); - Value *NewV = propagateMetadata(V, E->Scalars); - if (NeedToShuffleReuses) { - NewV = Builder.CreateShuffleVector( - NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = NewV; - return NewV; - } - setInsertPointAfterBundle(E->Scalars, VL0); - auto *V = Gather(E->Scalars, VecTy); + assert(E->VectorizationKind == TEK_Vectorize && + "Unable to reuse extractvalues"); + LoadInst *LI = cast(VL0->getOperand(0)); + Builder.SetInsertPoint(LI); + PointerType *PtrTy = + PointerType::get(VecTy, LI->getPointerAddressSpace()); + Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); + Value *NewV = propagateMetadata(V, E->Scalars); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); } - E->VectorizedValue = V; - return V; + E->VectorizedValue = NewV; + return NewV; } case Instruction::ZExt: case Instruction::SExt: @@ -3466,7 +3475,8 @@ continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->VectorizationKind == TEK_Vectorize && + "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -3537,7 +3547,7 @@ TreeEntry *Entry = &EIdx; // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->VectorizationKind != TEK_Vectorize) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); Index: test/Transforms/SLPVectorizer/X86/extract-shuffle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract-shuffle.ll +++ test/Transforms/SLPVectorizer/X86/extract-shuffle.ll @@ -5,13 +5,10 @@ ; CHECK-LABEL: @g( ; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 ; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> undef, i8 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0 -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1 -; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[TMP5]], i32 1 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1 ; CHECK-NEXT: ret <2 x i8> [[INS2]] ; %x0 = extractelement <2 x i8> %x, i32 0