diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -322,26 +322,6 @@ /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// TargetTransformInfo::getInstructionThroughput? @@ -791,6 +771,8 @@ VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); + GatheredLoads.clear(); + GatheredLoadsEntriesFirst = -1; ExternalUses.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); @@ -816,7 +798,9 @@ /// reordered and return the most optimal order. /// \param TopToBottom If true, include the order of vectorized stores and /// insertelement nodes, otherwise skip them. 
- Optional getReorderingData(const TreeEntry &TE, bool TopToBottom); + Optional getReorderingData( + const TreeEntry &TE, bool TopToBottom, + DenseMap &ScatterVectorizeToReorder); /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes @@ -1825,7 +1809,8 @@ /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Mask is filled with the shuffle mask. Optional - isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, + isGatherShuffledEntry(const TreeEntry *TE, ArrayRef Scalars, + SmallVectorImpl &Mask, SmallVectorImpl &Entries); /// \returns the scalarization cost for this list of values. Assuming that @@ -1844,6 +1829,19 @@ /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(bool ForReduction) const; + /// Run through the list of all gathered loads in the graph and try to find + /// vector loads/masked gathers instead of regular gathers. Later these loads + /// are reshuffled to build final gathered nodes. + void tryToVectorizeGatheredLoads(); + + /// Checks if gather node \p E can be represented as a shuffle of vectorized + /// scalars and performs \p Callback transformation/analysis. + /// \returns The list of gathered loads and poison that should be gathered into + /// final vector. + SmallVector + tryToMatchVector(const TreeEntry *E, + function_ref)> Callback); + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. static void reorderInputsAccordingToOpcode(ArrayRef VL, @@ -2240,12 +2238,18 @@ /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; - /// Maps a value to the proposed vectorizable size. + /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; /// A list of scalars that we found that we need to keep as scalars. 
ValueSet MustGather; + /// A list of loads to be gathered during the vectorization process. We can + /// try to vectorize them at the end, if profitable. + SmallVector>> GatheredLoads; + /// The index of the first gathered load entry in the VectorizableTree. + int GatheredLoadsEntriesFirst = -1; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -2944,13 +2948,15 @@ if (!isa(V)) continue; if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) + if (!STE) { STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. + if (STE->getVectorFactor() != NumScalars) + return None; + } else if (STE != LocalSTE) { + // Take the order only from the single vector node of the same size. return None; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); + } + unsigned Lane = STE->findLaneForValue(V); if (Lane >= NumScalars) return None; if (CurrentOrder[Lane] != NumScalars) { @@ -2994,8 +3000,77 @@ return None; } -Optional BoUpSLP::getReorderingData(const TreeEntry &TE, - bool TopToBottom) { +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl &Order, + SmallVectorImpl &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {} LLVM + // treats loading/storing it as an i8 struct. 
If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. + PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + Align CommonAlignment = cast(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast(V)->getAlign()); + if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), + CommonAlignment)) + return LoadsState::ScatterVectorize; + } + + return LoadsState::Gather; +} + +Optional BoUpSLP::getReorderingData( + const TreeEntry &TE, bool TopToBottom, + DenseMap &ScatterVectorizeToReorder) { + if (GatheredLoadsEntriesFirst > 0 && TE.UserTreeIndices.empty() && + &TE != VectorizableTree.front().get()) + return None; // No need to reorder if need to shuffle reuses, still need to shuffle the // node. 
if (!TE.ReuseShuffleIndices.empty()) @@ -3005,23 +3080,22 @@ (TopToBottom && isa(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; - if (TE.State == TreeEntry::NeedToGather) { + bool LoadsScatterVectorize = false; + if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() && + allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. - if (((TE.getOpcode() == Instruction::ExtractElement && - !TE.isAltShuffle()) || + if ((TE.getOpcode() == Instruction::ExtractElement || (all_of(TE.Scalars, [](Value *V) { return isa(V); }) && any_of(TE.Scalars, [](Value *V) { return isa(V); }))) && - all_of(TE.Scalars, - [](Value *V) { - auto *EE = dyn_cast(V); - return !EE || isa(EE->getVectorOperandType()); - }) && - allSameType(TE.Scalars)) { + all_of(TE.Scalars, [](Value *V) { + auto *EE = dyn_cast(V); + return !EE || isa(EE->getVectorOperandType()); + })) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. 
OrdersType CurrentOrder; @@ -3031,9 +3105,27 @@ fixupOrderingIndices(CurrentOrder); return CurrentOrder; } - } - if (Optional CurrentOrder = findReusedOrderedScalars(TE)) + } else if (TE.getOpcode() == Instruction::Load) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), *TTI, + *DL, *SE, CurrentOrder, PointerOps); + if (Res == LoadsState::Vectorize) + return CurrentOrder; + LoadsScatterVectorize = Res == LoadsState::ScatterVectorize; + } + if (Optional CurrentOrder = findReusedOrderedScalars(TE)) { + if (LoadsScatterVectorize) { + TreeEntry *ScatterVectorTE = getTreeEntry(TE.Scalars.front()); + assert(ScatterVectorTE && "No related ScatterVector node found."); + if (ScatterVectorTE->Idx >= GatheredLoadsEntriesFirst && + ScatterVectorTE->UserTreeIndices.empty()) { + ScatterVectorizeToReorder.try_emplace(&TE, ScatterVectorTE); + return None; + } + } return CurrentOrder; + } } return None; } @@ -3044,13 +3136,17 @@ // ExtractElement gather nodes which can be vectorized and need to handle // their ordering. DenseMap GathersToOrders; + // Nodes with loads masked gathering built out of gathered loads that should + // be reordered to avoid extra shuffles. + DenseMap ScatterVectorizeToReorder; // Find all reorderable nodes with the given VF. // Currently the are vectorized stores,loads,extracts + some gathering of // extracts. 
- for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders, + &ScatterVectorizeToReorder]( const std::unique_ptr &TE) { - if (Optional CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + if (Optional CurrentOrder = getReorderingData( + *TE.get(), /*TopToBottom=*/true, ScatterVectorizeToReorder)) { VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); @@ -3124,6 +3220,10 @@ }); // Do an actual reordering, if profitable. for (std::unique_ptr &TE : VectorizableTree) { + // Do not reorder gathered loads. + if (GatheredLoadsEntriesFirst > 0 && TE.get()->UserTreeIndices.empty() && + TE.get() != VectorizableTree.front().get()) + continue; // Just do the reordering for the nodes with the given VF. if (TE->Scalars.size() != VF) { if (TE->ReuseShuffleIndices.size() == VF) { @@ -3180,13 +3280,16 @@ // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. SmallVector NonVectorized; + // Nodes with loads masked gathering built out of gathered loads that should + // be reordered to avoid extra shuffles. + DenseMap ScatterVectorizeToReorder; for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, - &NonVectorized]( + &NonVectorized, &ScatterVectorizeToReorder]( const std::unique_ptr &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - if (Optional CurrentOrder = - getReorderingData(*TE.get(), /*TopToBottom=*/false)) { + if (Optional CurrentOrder = getReorderingData( + *TE.get(), /*TopToBottom=*/false, ScatterVectorizeToReorder)) { OrderedEntries.insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); @@ -3256,6 +3359,8 @@ // search. The graph currently does not provide this dependency directly. 
for (EdgeInfo &EI : TE->UserTreeIndices) { TreeEntry *UserTE = EI.UserTE; + if (!UserTE) + continue; auto It = Users.find(UserTE); if (It == Users.end()) It = Users.insert({UserTE, {}}).first; @@ -3403,6 +3508,75 @@ if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && VectorizableTree.front()->ReuseShuffleIndices.empty()) VectorizableTree.front()->ReorderIndices.clear(); + // Reorder masked gather nodes built out of gathered loads. + SmallPtrSet Processed; + for (const auto &SVLoadsData : ScatterVectorizeToReorder) { + if (!Processed.insert(SVLoadsData.second).second) + continue; + Optional CurrentOrder = + findReusedOrderedScalars(*SVLoadsData.first); + assert(CurrentOrder && "Expected order."); + if (CurrentOrder->empty() || !SVLoadsData.second->UserTreeIndices.empty()) + continue; + SmallVector Operands; + SmallVector Worklist(1, SVLoadsData.second); + while (!Worklist.empty()) { + TreeEntry *CurrentEntry = Worklist.pop_back_val(); + for (int I = 0, E = CurrentEntry->getNumOperands(); I < E; ++I) { + TreeEntry *Op = getTreeEntry(CurrentEntry->getOperand(I).front()); + if (!Op || !Op->isSame(CurrentEntry->getOperand(I))) { + auto *It = + find_if(VectorizableTree, + [CurrentEntry, I](const std::unique_ptr &TE) { + return TE->State != TreeEntry::Vectorize && + TE->isSame(CurrentEntry->getOperand(I)); + }); + assert(It != VectorizableTree.end() && + "No entry for pointers of ScatterVectorize node."); + Op = It->get(); + } + if (Op->ReuseShuffleIndices.empty()) + Worklist.push_back(Op); + Operands.push_back(Op); + } + } + if (any_of(Operands, [](const TreeEntry *TE) { + return TE->UserTreeIndices.size() != 1 && + (!isSplat(TE->Scalars) || + any_of(TE->Scalars, UndefValue::classof)); + })) + continue; + // Reorder related masked gather node and its operands. + SmallVector Mask(CurrentOrder->size(), UndefMaskElem); + unsigned E = CurrentOrder->size(); + transform(*CurrentOrder, Mask.begin(), [E](unsigned I) { + return I < E ? 
static_cast(I) : UndefMaskElem; + }); + for (TreeEntry *OpTE : Operands) { + // If there are several users of the pointers tree entry, no need to + // reorder the scatter vectorize node, still have same number of shuffles. + if (!OpTE->ReuseShuffleIndices.empty()) { + reorderReuses(OpTE->ReuseShuffleIndices, Mask); + } else if (OpTE->State == TreeEntry::NeedToGather) { + assert(OpTE->ReorderIndices.empty() && + "Expected no ordering for pointers node."); + reorderScalars(OpTE->Scalars, Mask); + } else { + assert(OpTE->State == TreeEntry::Vectorize && + "Expected Vectorize node"); + assert(OpTE->getNumOperands() == 2 && + "Expected 2 operands for pointers node."); + assert(OpTE->ReorderIndices.empty() && + "Expected no ordering for pointers node."); + OpTE->reorderOperands(Mask); + reorderScalars(OpTE->Scalars, Mask); + } + } + SVLoadsData.second->reorderOperands(Mask); + assert(SVLoadsData.second->ReorderIndices.empty() && + "Expected empty reorder sequence."); + reorderScalars(SVLoadsData.second->Scalars, Mask); + } } void BoUpSLP::buildExternalUses( @@ -3472,71 +3646,124 @@ if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); + // Try to vectorize gathered loads if this is not just a gather of loads. + if (!GatheredLoads.empty() && + !(VectorizableTree.size() == 2 && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.back()->State == TreeEntry::NeedToGather && + VectorizableTree.back()->getOpcode() != Instruction::Load)) + tryToVectorizeGatheredLoads(); } -namespace { -/// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; -} // anonymous namespace - -/// Checks if the given array of loads can be represented as a vectorized, -/// scatter or just simple gather. 
-static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl &Order, - SmallVectorImpl &PointerOps) { - // Check that a vectorized load would load the same memory as a scalar - // load. For example, we don't want to vectorize loads that are smaller - // than 8-bit. Even though we have a packed struct {} LLVM - // treats loading/storing it as an i8 struct. If we vectorize loads/stores - // from such a struct, we read/write packed bits disagreeing with the - // unvectorized version. - Type *ScalarTy = VL0->getType(); +void BoUpSLP::tryToVectorizeGatheredLoads() { + GatheredLoadsEntriesFirst = VectorizableTree.size(); - if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) - return LoadsState::Gather; + // Sort loads by distance. + auto &&LoadSorter = [](const std::pair &L1, + const std::pair &L2) { + return L1.second > L2.second; + }; - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - PointerOps.clear(); - PointerOps.resize(VL.size()); - auto *POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); - if (!L->isSimple()) - return LoadsState::Gather; - *POIter = L->getPointerOperand(); - ++POIter; + for (MutableArrayRef> LoadsDists : GatheredLoads) { + sort(LoadsDists, LoadSorter); + SmallVector Loads(LoadsDists.size()); + transform(LoadsDists, Loads.begin(), + [](const std::pair &L) { return L.first; }); + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + for (int NumElts = PowerOf2Floor(Loads.size()); NumElts > 1; NumElts /= 2) { + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; + ++Cnt) { + ArrayRef Slice = makeArrayRef(Loads).slice(Cnt, NumElts); + if (VectorizedLoads.count(Slice.front()) || + VectorizedLoads.count(Slice.back())) + continue; + // Check if it is profitable to try vectorizing gathered loads. 
It is + // profitable if we have at least 4 consecutive loads or if we have + // less but all users are vectorized or deleted. + if (NumElts >= 4 || + (NumElts == 2 && + (all_of(Slice, + [this](LoadInst *LI) { + return LI->hasOneUse() || + (std::distance(LI->user_begin(), + LI->user_end()) == + LI->getNumUses() && + all_of(LI->users(), [this](User *U) { + return (isa(U) && + isDeleted(cast(U))) || + getTreeEntry(U); + })); + }))) || + any_of(VectorizableTree, + [Slice](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && + TE->Scalars.size() == 2 && + (equal(TE->Scalars, Slice) || + equal(TE->Scalars, reverse(Slice))); + })) { + SmallVector PointerOps; + OrdersType CurrentOrder; + // Try to build vector load. + ArrayRef Values( + reinterpret_cast(Slice.begin()), Slice.size()); + if (canVectorizeLoads(Values, Slice.front(), *TTI, *DL, *SE, + CurrentOrder, + PointerOps) != LoadsState::Gather) { + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" + << NumElts << ")\n"); + + buildTree_rec(Values, 0, EdgeInfo()); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += NumElts; + Cnt += NumElts - 1; + continue; + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Loads.size()) + break; + } + } } +} - Order.clear(); - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; +/// Collects the loads of \p VL into \p GatheredLoads, grouping loads of the +/// same block by their pointer distance to the first load of the group. 
+static void gatherPossiblyVectorizableLoads( + const BoUpSLP &R, ArrayRef VL, const DataLayout &DL, + ScalarEvolution &SE, + SmallVectorImpl>> &GatheredLoads) { + for (Value *V : VL) { + if (auto *LI = dyn_cast(V)) { + if (!R.isDeleted(LI) && isValidElementType(LI->getType())) { + bool IsFound = false; + for (auto &Data : GatheredLoads) { + if (LI->getParent() != Data.front().first->getParent()) + continue; + Optional Dist = + getPointersDiff(LI->getType(), LI->getPointerOperand(), + Data.front().first->getType(), + Data.front().first->getPointerOperand(), DL, SE, + /*StrictCheck=*/true); + if (Dist && + all_of(Data, [&Dist](const std::pair &Pair) { + return Pair.second != *Dist; + })) { + Data.emplace_back(LI, *Dist); + IsFound = true; + break; + } + } + if (!IsFound) + GatheredLoads.emplace_back().emplace_back(LI, 0); + } } - Optional Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == VL.size() - 1) - return LoadsState::Vectorize; - Align CommonAlignment = cast(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); - if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) - return LoadsState::ScatterVectorize; } - - return LoadsState::Gather; } void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, @@ -3574,6 +3801,8 @@ !isConstant(V); })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return false; @@ -3584,8 +3813,20 @@ }; InstructionsState S = getSameOpcode(VL); + // Don't vectorize ephemeral values. 
+ for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } + } + if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3621,11 +3862,15 @@ // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || + bool IsSplat = isSplat(VL); + if (allConstant(VL) || IsSplat || !allSameBlock(VL) || !S.getOpcode() || + (S.getOpcode() == Instruction::Load && UserTreeIdx.UserTE) || (isa(S.MainOp) && !all_of(VL, isVectorLikeInstWithConstOps))) { + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - if (TryToFindDuplicates(S)) + if (IsSplat || TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; @@ -3634,16 +3879,6 @@ // We now know that this is a vector of instructions of the same type from // the same block. - // Don't vectorize ephemeral values. - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } - } - // Check if this is a duplicate of another entry. 
if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); @@ -3925,6 +4160,8 @@ else LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); #endif // NDEBUG + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); break; } return; @@ -4601,6 +4838,54 @@ } } +SmallVector BoUpSLP::tryToMatchVector( + const TreeEntry *E, + function_ref)> Callback) { + ArrayRef VL = E->Scalars; + // Check if these gather loads are part of the vectorized wide loads. + MapVector> Masks; + SmallVector GatheredVectors(VL.size(), + PoisonValue::get(VL.front()->getType())); + for (int I = 0, End = VL.size(); I < End; ++I) { + if (TreeEntry *TE = getTreeEntry(VL[I])) { + auto It = Masks.find(TE); + if (It == Masks.end()) { + It = Masks + .insert(std::make_pair( + TE, SmallVector(VL.size(), UndefMaskElem))) + .first; + } + It->second[I] = TE->findLaneForValue(VL[I]); + } else { + GatheredVectors[I] = VL[I]; + } + } + // Perform callback transformation/analysis on the built masks/entries. 
+ if (!Masks.empty()) { + if (Masks.size() == 1) { + Callback(Masks.begin()->first, Masks.begin()->second); + } else { + Callback(Masks.begin()->first, None); + SmallVector &Mask = Masks.begin()->second; + bool AdjustMask = false; + for (const auto &Data : drop_begin(Masks)) { + for (int I = 0, End = Data.second.size(); I < End; ++I) { + int Idx = Data.second[I]; + if (Idx != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Mask is used already."); + Mask[I] = End + Idx; + } else if (AdjustMask && Mask[I] != UndefMaskElem) { + Mask[I] = I; + } + } + Callback(Data.first, Mask); + AdjustMask = true; + } + } + } + return GatheredVectors; +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -4710,157 +4995,225 @@ return 0; if (isa(VL[0])) return InstructionCost::getInvalid(); + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. + // Also, check if the gathered loads can be shuffled out of final vector + // loads. + InstructionCost GatherCost = 0; + // Check if these gathers are part of the vectorized nodes. + bool MultipleShuffles = false; + SmallVector Gathers = + tryToMatchVector(E, [this, VecTy, &MultipleShuffles, + &GatherCost](TreeEntry *TE, ArrayRef Mask) { + if (Mask.empty()) { + // This is the first element of multiple (>2) shuffles of loads. + MultipleShuffles = true; + } else if (!MultipleShuffles) { + int Limit = TE->Scalars.size() * 2; + if (TE->Scalars.size() != Mask.size() || + (all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(Mask))) { + // FIXME: Replace with SK_InsertSubvector/SK_ExtractSubvector once + // it is properly supported. 
+ int Index; + int NumElts; + if ((TE->Scalars.size() > Mask.size() && + ShuffleVectorInst::isExtractSubvectorMask( + Mask, TE->Scalars.size(), Index) && + Index % Mask.size() != 0) || + (TE->Scalars.size() < Mask.size() && + ShuffleVectorInst::isInsertSubvectorMask( + Mask, TE->Scalars.size(), NumElts, Index) && + Index % NumElts != 0)) { + GatherCost += + TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask); + } + } + } else { + GatherCost += + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask); + } + }); + BoUpSLP::ValueSet VectorizedScalars; + unsigned StartIdx = 0; + unsigned VF = Gathers.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = + DL->getTypeSizeInBits(Gathers.front()->getType()); + for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = Gathers.size(); + Cnt + VF <= End; Cnt += VF) { + ArrayRef Slice = makeArrayRef(Gathers).slice(Cnt, VF); + if (!VectorizedScalars.count(Slice.front()) && + !VectorizedScalars.count(Slice.back()) && allSameBlock(Slice) && + all_of(Slice, [](const Value *V) { return isa(V); })) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, + *SE, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedScalars.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Gathers.size()) + break; + // Found vectorizable parts - exit. 
+ if (!VectorizedScalars.empty()) + break; + } + if (!VectorizedScalars.empty()) { + unsigned NumParts = TTI->getNumberOfParts(VecTy); + bool NeedInsertSubvectorAnalysis = + !NumParts || (VL.size() / VF) > NumParts; + // Remove vectorized loads from the gather list. + for (unsigned I = 0, End = Gathers.size(); I < End; I += VF) { + if (VectorizedScalars.contains(Gathers[I])) + for (unsigned K = 0; K < VF; ++K) + Gathers[I + K] = PoisonValue::get(ScalarTy); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedScalars) { + auto *LI = cast(V); + ScalarsCost += TTI->getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, LI); + } + auto *LI = cast(*VectorizedScalars.begin()); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, LI); + GatherCost += ScatterVectorizeCnt * + TTI->getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + GatherCost -= ScalarsCost; + if (NeedInsertSubvectorAnalysis) { + // Add the cost for the subvectors insert. + for (int I = VF, E = VL.size(); I < E; I += VF) + GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, + None, I, LoadTy); + } + } + // Check if need to shuffle 2 vectors - gathered subvector and the shuffled + // vector. 
+ bool HasShuffledGathers = + Gathers != VL && !all_of(Gathers, UndefValue::classof); + // if (Gathers != VL && !all_of(Gathers, UndefValue::classof)) + // GatherCost += TTI->getShuffleCost(TTI::SK_Select, VecTy); SmallVector Mask; SmallVector Entries; Optional Shuffle = - isGatherShuffledEntry(E, Mask, Entries); + isGatherShuffledEntry(E, Gathers, Mask, Entries); if (Shuffle.hasValue()) { - InstructionCost GatherCost = 0; + // Need to shuffle 2 vectors - gathered subvector and the shuffled vector. + if (HasShuffledGathers) + GatherCost += TTI->getShuffleCost(TTI::SK_Select, VecTy); + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + Gathers[I] = PoisonValue::get(ScalarTy); + } if (ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( dbgs() << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); + << *Gathers.front() << ".\n"); if (NeedToShuffleReuses) - GatherCost = + GatherCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); } else { LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle that starts with " - << *VL.front() << ".\n"); + << *Gathers.front() << ".\n"); // Detected that instead of gather we can emit a shuffle of single/two // previously vectorized nodes. Add the cost of the permutation rather // than gather. 
::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); + GatherCost += TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); } - return GatherCost; } if ((E->getOpcode() == Instruction::ExtractElement || - all_of(E->Scalars, + all_of(Gathers, [](Value *V) { return isa(V); })) && - allSameType(VL)) { + allSameType(Gathers)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector Mask; Optional ShuffleKind = - isFixedVectorShuffle(VL, Mask); + isFixedVectorShuffle(Gathers, Mask); if (ShuffleKind.hasValue()) { + // Need to shuffle 2 vectors - gathered subvector and the shuffled + // vector. + if (HasShuffledGathers) + GatherCost += TTI->getShuffleCost(TTI::SK_Select, VecTy); // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. InstructionCost Cost = - computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); + computeExtractCost(Gathers, VecTy, *ShuffleKind, Mask, *TTI); AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - return Cost; + return GatherCost + Cost; } } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - assert(VecTy == FinalVecTy && - "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) + if (!Shuffle.hasValue() && NeedToShuffleReuses) { ReuseShuffleCost = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. 
- if (VL.size() > 2 && E->getOpcode() == Instruction::Load && - !E->isAltShuffle()) { - BoUpSLP::ValueSet VectorizedLoads; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); - for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, - *SE, CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; - else - ++ScatterVectorizeCnt; - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; - } - if (!VectorizedLoads.empty()) { - InstructionCost GatherCost = 0; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += getGatherCost(VL.slice(I, VF)); - } - // The cost for vectorized loads. 
- InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += TTI->getMemoryOpCost( - Instruction::Load, LI->getType(), LI->getAlign(), - LI->getPointerAddressSpace(), CostKind, LI); - } - auto *LI = cast(E->getMainOp()); - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, LI); - GatherCost += ScatterVectorizeCnt * - TTI->getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - for (int I = VF, E = VL.size(); I < E; I += VF) - GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - None, I, LoadTy); - } - return ReuseShuffleCost + GatherCost - ScalarsCost; - } + if (!VectorizedScalars.empty()) + GatherCost += ReuseShuffleCost; + } + if (HasShuffledGathers && any_of(Gathers, [](Value *V) { + return !isa(V) && isConstant(V); + })) { + // Final permute with the vector of scalars. + GatherCost += TTI->getShuffleCost(TTI::SK_Select, VecTy); + } + if (all_of(Gathers, isConstant)) + return GatherCost; + if (isSplat(Gathers) && (Gathers == VL || VL.size() > 2)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + assert((Gathers != VL || VecTy == FinalVecTy) && + "No reused scalars expected for broadcast."); + return GatherCost + + (Gathers == VL ? 
0 + : TTI->getShuffleCost( + TargetTransformInfo::SK_Select, VecTy)) + + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - return ReuseShuffleCost + getGatherCost(VL); + if (Gathers != VL) + return GatherCost + getGatherCost(Gathers); + return GatherCost + ReuseShuffleCost + getGatherCost(Gathers); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -5613,6 +5966,21 @@ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I].get(); + // Exclude cost of gather loads nodes which are not used. These nodes were + // built as part of the final attempt to vectorize gathered loads. + if (GatheredLoadsEntriesFirst >= 0 && + I >= static_cast(GatheredLoadsEntriesFirst) && + TE.State == TreeEntry::NeedToGather) { + assert(all_of(TE.Scalars, + [this](Value *V) { + return (isa(V) && MustGather.contains(V)) || + isa(V) || + V->getType()->isPtrOrPtrVectorTy(); + }) && + "Expected loads, pointers or constants only."); + continue; + } + InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -5810,11 +6178,12 @@ } Optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, +BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef Scalars, + SmallVectorImpl &Mask, SmallVectorImpl &Entries) { // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - Mask.assign(TE->Scalars.size(), UndefMaskElem); + Mask.assign(Scalars.size(), UndefMaskElem); Entries.clear(); // Build a lists of values to tree entries. DenseMap> ValueToTEs; @@ -5835,7 +6204,7 @@ // have a permutation of 2 input vectors. SmallVector> UsedTEs; DenseMap UsedValuesEntry; - for (Value *V : TE->Scalars) { + for (Value *V : Scalars) { if (isa(V)) continue; // Build a list of tree entries where V is used. 
@@ -5883,11 +6252,17 @@ } } + if (UsedTEs.empty()) { + assert(all_of(Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. - auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { - return EntryPtr->isSame(TE->Scalars); + auto It = find_if(UsedTEs.front(), [Scalars](const TreeEntry *EntryPtr) { + return EntryPtr->isSame(Scalars); }); if (It != UsedTEs.front().end()) { Entries.push_back(*It); @@ -5918,8 +6293,8 @@ } // Build a shuffle mask for better cost estimation and vector emission. - for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { - Value *V = TE->Scalars[I]; + for (int I = 0, E = Scalars.size(); I < E; ++I) { + Value *V = Scalars[I]; if (isa(V)) continue; unsigned Idx = UsedValuesEntry.lookup(V); @@ -6247,56 +6622,15 @@ } } - // Check that every instruction appears once in this bundle. - SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { - DenseMap UniquePositions; - unsigned NumValues = - std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) { - return !isa(V); - }).base()); - VF = std::max(VF, PowerOf2Ceil(NumValues)); - int UniqueVals = 0; - for (Value *V : VL.drop_back(VL.size() - VF)) { - if (isa(V)) { - ReuseShuffleIndicies.emplace_back(UndefMaskElem); - continue; - } - if (isConstant(V)) { - ReuseShuffleIndicies.emplace_back(UniqueValues.size()); - UniqueValues.emplace_back(V); - continue; - } - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) { - UniqueValues.emplace_back(V); - ++UniqueVals; - } - } - if (UniqueVals == 1 && UniqueValues.size() == 1) { - // Emit pure splat vector. 
- ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), - UndefMaskElem); - } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { - ReuseShuffleIndicies.clear(); - UniqueValues.clear(); - UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); - } - UniqueValues.append(VF - UniqueValues.size(), - PoisonValue::get(VL[0]->getType())); - VL = UniqueValues; - } - - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, - CSEBlocks); - Value *Vec = gather(VL); - if (!ReuseShuffleIndicies.empty()) { - ShuffleBuilder.addMask(ReuseShuffleIndicies); - Vec = ShuffleBuilder.finalize(Vec); - } - return Vec; + auto *I = + find_if(VectorizableTree, [VL](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && + (TE->ReuseShuffleIndices.empty() || + TE->ReuseShuffleIndices.size() == VL.size()) && + TE->isSame(VL); + }); + assert(I != VectorizableTree.end() && "Gather node is not in the graph."); + return vectorizeTree(I->get()); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -6312,27 +6646,427 @@ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, CSEBlocks); if (E->State == TreeEntry::NeedToGather) { - if (E->getMainOp()) + if (allConstant(E->Scalars)) { + Value *Vec = gather(E->Scalars); + E->VectorizedValue = Vec; + return Vec; + } + // Checks if the mask is an identity mask. + auto &&IsIdentityMask = [](ArrayRef Mask, FixedVectorType *VecTy) { + int Limit = Mask.size(); + return VecTy->getNumElements() == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask); + }; + // Tries to combine 2 different masks into single one. 
+ auto &&CombineMasks = [](SmallVectorImpl &Mask, ArrayRef ExtMask) { + SmallVector NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + NewMask[I] = Mask[ExtMask[I]]; + } + Mask.swap(NewMask); + }; + // Smart shuffle instruction emission, walks through shuffles trees and + // tries to find the best matching vector for the actual shuffle + // instruction. + auto &&CreateShuffle = [this, &IsIdentityMask, + &CombineMasks](Value *V1, Value *V2, + ArrayRef Mask) -> Value * { + assert(V1 && "Expected at least one vector value."); + if (V2 && !isUndefVector(V2)) { + Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + if (isa(V1)) + return PoisonValue::get(FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size())); + Value *Op = V1; + SmallVector CombinedMask(Mask.begin(), Mask.end()); + while (auto *SV = dyn_cast(Op)) { + // Exit if not a fixed vector type or changing size shuffle. + if (!isa(SV->getType())) + break; + // Exit if the identity or broadcast mask is found. 
+ if (IsIdentityMask(CombinedMask, cast(SV->getType()))) + break; + bool IsOp1Undef = isUndefVector(SV->getOperand(0)); + bool IsOp2Undef = isUndefVector(SV->getOperand(1)); + if (!IsOp1Undef && !IsOp2Undef) + break; + SmallVector ShuffleMask(SV->getShuffleMask().begin(), + SV->getShuffleMask().end()); + CombineMasks(ShuffleMask, CombinedMask); + CombinedMask.swap(ShuffleMask); + if (IsOp2Undef) + Op = SV->getOperand(0); + else + Op = SV->getOperand(1); + } + if (!isa(Op->getType()) || + !IsIdentityMask(CombinedMask, cast(Op->getType()))) { + Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + return Op; + }; + + // Can set insert point safely on for the initial gather node. + if (E == VectorizableTree.front().get() && E->getMainOp()) setInsertPointAfterBundle(E); - Value *Vec; + auto &&SetInsertPoint = [this](Value *Vec1, Value *Vec2 = nullptr) { + Instruction *LastInst = nullptr; + auto *IVec1 = dyn_cast(Vec1); + auto *IVec2 = dyn_cast_or_null(Vec2); + if (IVec1 && IVec2) { + if (IVec1->getParent() != Builder.GetInsertBlock() && + IVec2->getParent() != Builder.GetInsertBlock()) + return; + LastInst = ((IVec1->getParent() != IVec2->getParent() && + IVec1->getParent() == Builder.GetInsertBlock()) || + (IVec1->getParent() == IVec2->getParent() && + IVec2->comesBefore(IVec1))) + ? IVec1 + : IVec2; + } else if (IVec1) { + LastInst = IVec1; + } else if (IVec2) { + LastInst = IVec2; + } else { + return; + } + // Set the insertion point after the last instruction in the bundle. Set + // the debug location to Front. 
+ Builder.SetInsertPoint(LastInst->getParent(), ++LastInst->getIterator()); + Builder.SetCurrentDebugLocation(LastInst->getDebugLoc()); + }; + SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); + Value *Vec = nullptr; + // Also, check if the gathered loads can be shuffled out of final vector + // loads. + std::pair> SingleVecMask; + SingleVecMask.first = nullptr; + // Check if these gathers are part of the vectorized nodes. + bool MultipleShuffles = false; + SmallVector GatheredScalars = tryToMatchVector( + E, [this, &SetInsertPoint, &MultipleShuffles, &SingleVecMask, &Vec, + &CreateShuffle](TreeEntry *TE, ArrayRef Mask) { + if (Mask.empty()) { + // This is the first element of multiple (>2) shuffles of loads. + MultipleShuffles = true; + Vec = vectorizeTree(TE); + } else if (!MultipleShuffles) { + SingleVecMask.first = vectorizeTree(TE); + SingleVecMask.second.assign(Mask.begin(), Mask.end()); + } else { + unsigned VecVF = + cast(Vec->getType())->getNumElements(); + unsigned VecValVF = + cast(TE->VectorizedValue->getType()) + ->getNumElements(); + if (VecVF == VecValVF && VecVF != Mask.size()) { + // Adjust mask and generate smaller shuffle. 
+ SmallVector AdjustedMask(Mask.begin(), Mask.end()); + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (AdjustedMask[I] >= Sz) { + AdjustedMask[I] -= Sz; + AdjustedMask[I] += VecVF; + } + } + Value *TEVec = vectorizeTree(TE); + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, TEVec); + Vec = CreateShuffle(Vec, TEVec, AdjustedMask); + } else if (VecVF != VecValVF) { + Value *TEVec = vectorizeTree(TE); + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, TEVec); + SmallVector ExpandMask(Mask.size(), UndefMaskElem); + std::iota(ExpandMask.begin(), + std::next(ExpandMask.begin(), VecValVF), 0); + if (VecVF < VecValVF) + Vec = CreateShuffle(Vec, nullptr, ExpandMask); + else + TEVec = CreateShuffle(TEVec, nullptr, ExpandMask); + Vec = CreateShuffle(Vec, TEVec, Mask); + } else { + Value *TEVec = vectorizeTree(TE); + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, TEVec); + Vec = CreateShuffle(Vec, TEVec, Mask); + } + } + }); SmallVector Mask; - SmallVector Entries; - Optional Shuffle = - isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - if (auto *I = dyn_cast(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (!all_of(GatheredScalars, PoisonValue::classof)) { + Type *ScalarTy = GatheredScalars.front()->getType(); + SmallVector Entries; + Optional Shuffle = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + if (Shuffle.hasValue()) { + // Remove shuffled elements from list of gathers. 
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + if (Entries.size() == 1) { + if (!SingleVecMask.first) { + int Limit = Mask.size() * 2; + if (cast( + Entries.front()->VectorizedValue->getType()) + ->getNumElements() != Mask.size() || + (all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(Mask))) { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Entries.front()->VectorizedValue); + Vec = CreateShuffle(Entries.front()->VectorizedValue, nullptr, + Mask); + } else { + Vec = Entries.front()->VectorizedValue; + } + } else { + // Combine single masks into one common. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Element already used."); + Mask[I] = SingleVecMask.second[I]; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] += Sz; + } + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first, + Entries.front()->VectorizedValue); + Vec = CreateShuffle(SingleVecMask.first, + Entries.front()->VectorizedValue, Mask); + SingleVecMask.first = nullptr; + } + } else { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Entries.front()->VectorizedValue, + Entries.back()->VectorizedValue); + Vec = CreateShuffle(Entries.front()->VectorizedValue, + Entries.back()->VectorizedValue, Mask); + if (SingleVecMask.first) { + // Combine masks into one common. 
+ for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Element already used."); + Mask[I] = SingleVecMask.second[I]; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] = I + Sz; + } + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + Vec = CreateShuffle(SingleVecMask.first, Vec, Mask); + SingleVecMask.first = nullptr; + } + } + } else { + // Check that every instruction appears once in this bundle. + SmallVector UniqueValues; + if (GatheredScalars.size() > 2 && !NeedToShuffleReuses) { + DenseMap UniquePositions; + int UniqueVals = 0; + for (Value *V : GatheredScalars) { + if (isa(V) && !isa(V)) { + ReuseShuffleIndicies.emplace_back(UndefMaskElem); + continue; + } + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back(UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(Res.first->second); + if (Res.second) { + UniqueValues.emplace_back(V); + ++UniqueVals; + } + } + if (UniqueVals == 1 && UniqueValues.size() == 1) { + // Emit pure splat vector. + ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), + UndefMaskElem); + } else if (UniqueValues.size() >= VF - 1 || + UniqueValues.size() <= 1) { + ReuseShuffleIndicies.clear(); + UniqueValues.swap(GatheredScalars); + } + UniqueValues.append(VF - UniqueValues.size(), + PoisonValue::get(ScalarTy)); + GatheredScalars.swap(UniqueValues); + } + } + } + + Value *VecVal = nullptr; + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(GatheredScalars, + [](Value *V) { + return isa(V); + })) && + allSameType(GatheredScalars)) { + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. 
+ SmallVector Mask; + Optional ShuffleKind = + isFixedVectorShuffle(GatheredScalars, Mask); + if (ShuffleKind.hasValue()) { + // Find input vectors. + Value *Vec1 = nullptr; + Value *Vec2 = nullptr; + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] == UndefMaskElem) + continue; + auto *EI = cast(GatheredScalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } + } + if (Vec2) { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec1, Vec2); + VecVal = CreateShuffle(Vec1, Vec2, Mask); + } else if (!Vec1) { + // If extracts are all from undef vectors - just build poison vector. + VecVal = PoisonValue::get(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size())); + } else if (GatheredScalars.size() != + cast(Vec1->getType()) + ->getNumElements() || + !ShuffleVectorInst::isIdentityMask(Mask)) { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec1); + VecVal = CreateShuffle(Vec1, nullptr, Mask); + } + } + } + if (!VecVal) { + IRBuilder<>::InsertPointGuard Guard(Builder); + auto &&SetInsertPointAfterOps = [this](ArrayRef VL) { + // The last instruction in the bundle in program order. + Instruction *LastInst = nullptr; + + for (Value *V : VL) { + // If the value was vectorized, need to get the vector value for + // correct insert point. 
+ if (const TreeEntry *TE = getTreeEntry(V)) + if (TE->VectorizedValue) + V = TE->VectorizedValue; + auto *I = dyn_cast(V); + if (!I) + continue; + if (!DT->isReachableFromEntry(I->getParent())) + continue; + if (!LastInst) { + LastInst = I; + continue; + } + if ((LastInst->getParent() != I->getParent() && + DT->dominates(LastInst->getParent(), I->getParent())) || + (LastInst->getParent() == I->getParent() && + LastInst->comesBefore(I))) + LastInst = I; + } + // Set the insertion point after the last instruction in the bundle. + // Set the debug location to Front. + if (!LastInst) + return; + if (isa(LastInst)) + Builder.SetInsertPoint(LastInst->getParent(), + LastInst->getParent()->getFirstInsertionPt()); + else + Builder.SetInsertPoint(LastInst->getParent(), + std::next(LastInst->getIterator())); + Builder.SetCurrentDebugLocation(LastInst->getDebugLoc()); + }; + SetInsertPointAfterOps(GatheredScalars); + VecVal = gather(GatheredScalars); + } + bool NeedToShuffleReuses = true; + if (Vec) { + assert(!SingleVecMask.first && "Unexpected single vectorized value."); + if (!all_of(GatheredScalars, UndefValue::classof)) { + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (!isa(GatheredScalars[I])) + Mask[I] = I + Sz; + else + Mask[I] = I; + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, VecVal); + Vec = CreateShuffle(Vec, VecVal, Mask); + } + } else if (SingleVecMask.first) { + if (all_of(GatheredScalars, UndefValue::classof)) { + int Limit = SingleVecMask.second.size() * 2; + if (cast(SingleVecMask.first->getType()) + ->getNumElements() != SingleVecMask.second.size() || + (all_of(SingleVecMask.second, + [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(SingleVecMask.second))) { + if (!ReuseShuffleIndicies.empty()) { + ShuffleBuilder.addMask(SingleVecMask.second); + ShuffleBuilder.addMask(ReuseShuffleIndicies); + Vec = ShuffleBuilder.finalize(SingleVecMask.first); + NeedToShuffleReuses = false; + } 
else { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + Vec = CreateShuffle(SingleVecMask.first, nullptr, + SingleVecMask.second); + } + } else { + Vec = SingleVecMask.first; + } + } else { + Value *AdjustedVec; + if (cast(SingleVecMask.first->getType()) + ->getNumElements() != GatheredScalars.size()) { + // Adjust vector value and generate smaller shuffle. + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + AdjustedVec = + CreateShuffle(SingleVecMask.first, nullptr, SingleVecMask.second); + for (int I = 0, Sz = SingleVecMask.second.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) + SingleVecMask.second[I] = I; + } + } else { + AdjustedVec = SingleVecMask.first; + } + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (!isa(GatheredScalars[I])) + SingleVecMask.second[I] = I + Sz; + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(AdjustedVec, VecVal); + Vec = CreateShuffle(AdjustedVec, VecVal, SingleVecMask.second); } } else { - Vec = gather(E->Scalars); + Vec = VecVal; } - if (NeedToShuffleReuses) { - ShuffleBuilder.addMask(E->ReuseShuffleIndices); + if (NeedToShuffleReuses && !ReuseShuffleIndicies.empty()) { + ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); } E->VectorizedValue = Vec; @@ -6887,6 +7621,14 @@ scheduleBlock(BSIter.second.get()); } + // Need to vectorize gathered loads independently since there are no direct + // users, only indirect ones, represented by gathered nodes. 
+ for (const std::unique_ptr &TE : VectorizableTree) { + if (TE->getOpcode() == Instruction::Load && + TE->State != TreeEntry::NeedToGather && TE->UserTreeIndices.empty() && + TE.get() != VectorizableTree.front().get()) + TE->VectorizedValue = vectorizeTree(TE.get()); + } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -308,12 +308,12 @@ define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @reverse_hadd_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> diff 
--git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -41,7 +41,7 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; @@ -279,15 +279,15 @@ ; CHECK-LABEL: @cmp_lt_gt( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 +; CHECK-NEXT: 
[[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i64 1 ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -18,19 +18,18 @@ ; NOACCELERATE-LABEL: @int_sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; 
NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -214,19 +213,18 @@ ; NOACCELERATE-LABEL: @exp_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> 
poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -297,19 +295,18 @@ ; NOACCELERATE-LABEL: @log_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; 
NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -473,19 +470,18 @@ ; NOACCELERATE-LABEL: @sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x 
float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -515,19 +511,18 @@ ; NOACCELERATE-LABEL: @cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; 
NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1006,19 +1001,18 @@ ; NOACCELERATE-LABEL: @int_cos_4x( ; 
NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float 
@llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -18,19 +18,18 @@ ; NOACCELERATE-LABEL: @int_sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x 
float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -214,19 +213,18 @@ ; NOACCELERATE-LABEL: @exp_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> 
[[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -297,19 +295,18 @@ ; NOACCELERATE-LABEL: @log_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement 
<2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -473,19 +470,18 @@ ; NOACCELERATE-LABEL: @sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: 
[[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -515,19 +511,18 @@ ; NOACCELERATE-LABEL: @cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float 
[[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: @@ -1006,19 +1001,18 @@ ; NOACCELERATE-LABEL: @int_cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: 
[[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP4]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP5]], i32 3 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll 
b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -159,18 +159,18 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ] ; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[T4]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[T4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]] ; CHECK-NEXT: [[T7:%.*]] = or i32 [[T4]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[T7]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[T7]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP5]] ; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i64 0 ; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 ; 
CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-2 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s @@ -17,7 +17,7 @@ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-19' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '8' +; YAML-NEXT: - TreeSize: '10' define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { ; CHECK-LABEL: @test_select( @@ -145,7 +145,7 @@ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-10' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '3' +; YAML-NEXT: - TreeSize: '5' ; CHECK-LABEL: @reduction_with_br( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[H:%.*]], 0 @@ -246,7 +246,7 @@ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-36' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '10' +; YAML-NEXT: - TreeSize: '12' define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { ; CHECK-LABEL: @test_unrolled_select( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -10,23 +10,21 @@ ; CHECK-LABEL: @f_noalias( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCALE]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP6]], [[SHUFFLE1]] +; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -12,47 +12,47 @@ ; CHECK-LABEL: @wrap_mul4( ; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[TEMP:%.*]] = load double, double* [[ARRAYIDX1_I]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TEMP]], i32 1 ; CHECK-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 0, i64 1 ; CHECK-NEXT: [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP2]], i32 1 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x 
double>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP4]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP7]], [[TMP10]] ; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 ; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 ; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* ; CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP2]], 
[[TMP13]] ; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* ; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP4]], [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]] ; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 ; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 -; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 -; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 +; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[TEMP11]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP6]], [[TMP20]] +; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP9]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> 
[[TMP23]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]] -; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP22]] ; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]] ; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll @@ -9,7 +9,7 @@ %add1 = fadd double %f1, %f1 %w0 = getelementptr inbounds double, double* %w, i64 0 %w1 = getelementptr inbounds double, double* %w, i64 1 -; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3 +; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 5 store double %add0, double* %w0, !dbg !9 store double %add1, double* %w1 ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll @@ -24,14 +24,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX43]] to <4 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> poison, <4 x double> zeroinitializer, <4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> zeroinitializer, <4 x double> [[TMP2]]) ; CHECK-NEXT: br label [[SW_EPILOG:%.*]] 
; CHECK: sw.bb195: ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: do.body: ; CHECK-NEXT: unreachable ; CHECK: sw.epilog: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x double> [ poison, [[SW_BB195]] ], [ [[TMP3]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP3]], [[SW_BB]] ] ; CHECK-NEXT: ret i32 undef ; CHECK: if.end.1: ; CHECK-NEXT: br label [[FOR_COND15_1:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll @@ -15,13 +15,13 @@ ; CHECK-NEXT: [[CALL_I_I:%.*]] = call i32* @get_ptr() ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 2 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 2 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[GEP_1]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1 ; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[RES]] to <2 x i64>* diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -134,14 +134,14 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -134,14 +134,14 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll 
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -13,16 +13,15 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 ; CHECK-NEXT: [[V2_LANE_3:%.*]] = extractelement <4 x double> [[V_2]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: call void @use(double [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP4]]) -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -54,17 +53,15 @@ ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V_3:%.*]] = load <2 x double>, <2 x double>* [[PTR_3:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[V_1]], <2 x double> [[V_3]], <2 x i32> ; CHECK-NEXT: [[V3_LANE_1:%.*]] = extractelement <2 x double> [[V_3]], i32 
1 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V3_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V3_LANE_1]]) -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -95,19 +92,17 @@ ; CHECK-LABEL: @noop_extract_second_2_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, <4 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <4 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double 
[[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -137,16 +132,15 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: call void @use(double [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> 
[[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP5]]) -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -175,19 +169,17 @@ ; CHECK-LABEL: @extract_lanes_1_and_2( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, <4 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <4 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) -; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -218,28 +210,24 @@ ; CHECK-LABEL: @noop_extracts_existing_vector_4_lanes( ; CHECK-NEXT: 
bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_1]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[V2_LANE_0]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, 
<9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <9 x double> [[TMP7]], <9 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <9 x double> [[TMP3]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -274,29 +262,23 @@ ; CHECK-LABEL: @extracts_jumbled_4_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_1]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_3]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x 
double> [[TMP5]], double [[V2_LANE_2]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[V2_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <9 x double> [[TMP9]], <9 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <9 x double> [[TMP3]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -333,6 +315,8 @@ ; CHECK-LABEL: @noop_extracts_9_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 @@ -343,41 +327,21 @@ ; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7 ; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 ; CHECK-NEXT: 
[[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_3]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_4]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_7]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_8]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_0]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_1]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP0]], [[SHUFFLE1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_7]], i32 1 -; 
CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_1]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_2]], i32 5 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_3]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_4]], i32 7 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V2_LANE_0]], i32 2 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP5]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP1]], [[SHUFFLE]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP25]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], <9 x 
double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -450,6 +414,8 @@ ; CHECK-LABEL: @first_mul_chain_jumbled( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 @@ -460,37 +426,19 @@ ; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7 ; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_6]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_5]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7 -; 
CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_0]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_2]], i32 2 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fmul <8 x double> [[TMP0]], [[SHUFFLE1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_7]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_1]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_2]], i32 5 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_3]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_4]], i32 7 -; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP4]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <8 x double> [[TMP1]], 
[[SHUFFLE1]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP22]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP6]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -563,6 +511,8 @@ ; CHECK-LABEL: @first_and_second_mul_chain_jumbled( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 @@ -573,41 +523,21 @@ ; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7 ; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, 
double [[V1_LANE_4]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP0]], [[SHUFFLE1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_7]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_6]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_1]], i32 3 -; CHECK-NEXT: 
[[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_0]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_3]], i32 5 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_2]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_5]], i32 7 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V2_LANE_0]], i32 2 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP5]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP1]], [[SHUFFLE]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]] -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP25]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll --- 
a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -244,12 +244,12 @@ ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[INS_11:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_11]], i16 [[ADD_2]], i64 2 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> +; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; bb: @@ -291,14 +291,14 @@ ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], 
<4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX8-NEXT: [[INS_32:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> -; GFX8-NEXT: ret <4 x i16> [[INS_32]] +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> +; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; bb: %arg0.0 = extractelement <4 x i16> %arg0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -244,12 +244,12 @@ ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[INS_11:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_11]], i16 [[ADD_2]], i64 2 +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> +; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; 
GFX8-NEXT: ret <3 x i16> [[INS_2]] ; bb: @@ -291,14 +291,14 @@ ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX8-NEXT: [[INS_32:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> -; GFX8-NEXT: ret <4 x i16> [[INS_32]] +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> +; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; bb: %arg0.0 = extractelement <4 x i16> %arg0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll @@ -8,12 +8,10 @@ ; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8 ; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7 ; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x 
i16> poison, i16 [[ARG0_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 [[ARG0_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[ARG1_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[ARG1_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP3]]) -; CHECK-NEXT: ret <2 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> [[ARG0]], <9 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1]], <9 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; CHECK-NEXT: ret <2 x i16> [[TMP2]] ; bb: %arg0.1 = extractelement <9 x i16> undef, i64 7 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -195,11 +195,11 @@ ; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 -; GCN-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]]) +; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 +; GCN-NEXT: [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1 +; 
GCN-NEXT: [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* +; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2 +; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) ; GCN-NEXT: [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)* ; GCN-NEXT: store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2 ; GCN-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -10,15 +10,15 @@ ; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ADD277]], i32 1 ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1 ; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1 -; CHECK-NEXT: 
[[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], ; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -6,10 +6,10 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -35,10 +35,10 @@ 
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -64,13 +64,13 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 ; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -9,8 +9,8 @@ ; CHECK: loop: ; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0 +; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -6,20 +6,21 @@ define { i64, i64 } @patatino(double %arg) { ; CHECK-LABEL: @patatino( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x 
i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* bitcast ([6 x double]* @global to <4 x double>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[ARG]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = fptosi <2 x double> [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sext <2 x i32> [[TMP9]] to <2 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 +; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1 +; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1 ; CHECK-NEXT: ret { i64, i64 } [[T17]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll @@ -6,12 +6,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[VECINS_I_5_I1:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll @@ -6,12 +6,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[VECINS_I_5_I1:%.*]] = shufflevector <8 x i32> undef, <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 | FileCheck %s --check-prefix=CHECK +; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION define void @Test(i32) { @@ -7,12 +7,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP0:%.*]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] ; CHECK-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]] @@ -39,12 +40,10 @@ ; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] ; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] ; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; 
CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_EXTRA26]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = and <2 x i32> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> , i32 [[OP_EXTRA26]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP6]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP2]] +; CHECK-NEXT: [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( @@ -90,9 +89,9 @@ ; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]] ; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]] -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 ; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP7]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 ; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[VAL_41]], i32 0 ; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = and <2 x i32> [[TMP8]], [[TMP9]] ; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -340,24 +340,14 @@ define void @vec_shuff_reorder() #0 
{ ; CHECK-LABEL: @vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 +; CHECK-NEXT: 
[[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -11,12 +11,12 @@ ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) 
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -32,12 +32,12 @@ ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -53,12 +53,12 @@ ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x 
i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -11,12 +11,12 @@ ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> 
@llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -32,12 +32,12 @@ ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -53,12 +53,12 @@ ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x 
float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -163,10 +163,10 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @sitofp_4i32_8i16( ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: ret <8 x 
float> [[R71]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -201,20 +201,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> 
[[TMP13]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -163,10 +163,10 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @sitofp_4i32_8i16( ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -201,20 +201,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: 
[[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -98,7 +98,7 @@ ; SLM-LABEL: @fmul_fdiv_v4f32_const( ; 
SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -98,7 +98,7 @@ ; SLM-LABEL: @fmul_fdiv_v4f32_const( ; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -170,9 +170,9 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: 
[[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], ; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R71]] @@ -230,8 +230,8 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> @@ -247,13 +247,13 @@ ; SSE-NEXT: ret <8 x i32> [[R71]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = 
shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -261,13 +261,13 @@ ; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -275,13 +275,13 @@ ; AVX1-NEXT: ret <8 x i32> [[R71]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: 
[[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -289,13 +289,13 @@ ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x 
i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -437,10 +437,10 @@ ; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> @@ -454,10 +454,10 @@ ; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -170,9 +170,9 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], ; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R71]] @@ -230,8 +230,8 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> @@ -247,13 +247,13 @@ ; SSE-NEXT: ret <8 x i32> [[R71]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP4:%.*]] = lshr <4 
x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -261,13 +261,13 @@ ; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -275,13 +275,13 @@ ; AVX1-NEXT: ret <8 x i32> [[R71]] ; ; AVX2-LABEL: 
@ashr_lshr_shl_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -289,13 +289,13 @@ ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 
x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -437,10 +437,10 @@ ; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> @@ -454,10 +454,10 @@ ; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: 
[[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll @@ -623,33 +623,25 @@ ; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 ; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 ; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 -; SLM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; SLM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1 -; SLM-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1 -; SLM-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; SLM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1 -; SLM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 0 -; SLM-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1 -; SLM-NEXT: [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]] -; SLM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[A4]], i32 0 -; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1 -; SLM-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[B4]], i32 0 -; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1 -; SLM-NEXT: [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]] -; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, 
double [[A6]], i32 0 -; SLM-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1 -; SLM-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[B6]], i32 0 -; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1 -; SLM-NEXT: [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]] -; SLM-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> -; SLM-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP23]], <8 x i32> -; SLM-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP24]], <8 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: 
[[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] +; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> +; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> +; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> ; SLM-NEXT: ret <8 x double> [[R73]] ; ; AVX-LABEL: @buildvector_div_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll @@ -623,33 +623,25 @@ ; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 ; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 ; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 -; SLM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; SLM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1 -; SLM-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1 -; SLM-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; SLM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1 -; SLM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 0 -; SLM-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double 
[[B3]], i32 1 -; SLM-NEXT: [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]] -; SLM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[A4]], i32 0 -; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1 -; SLM-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[B4]], i32 0 -; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1 -; SLM-NEXT: [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]] -; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[A6]], i32 0 -; SLM-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1 -; SLM-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[B6]], i32 0 -; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1 -; SLM-NEXT: [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]] -; SLM-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> -; SLM-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP23]], <8 x i32> -; SLM-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP24]], <8 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], 
<8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] +; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> +; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> +; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> ; SLM-NEXT: ret <8 x double> [[R73]] ; ; AVX-LABEL: @buildvector_div_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll @@ -16,10 +16,10 @@ ; CHECK-NEXT: [[A0:%.*]] = load i64, i64* [[A:%.*]], align 8 ; CHECK-NEXT: [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8 ; CHECK-NEXT: [[V1:%.*]] = sub i64 [[A0]], 1 -; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x 
i32> zeroinitializer +; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 @@ -70,15 +70,15 @@ ; CHECK-NEXT: [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8 ; CHECK-NEXT: [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8 -; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] +; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP5]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1 ; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr 
inbounds i32, i32* [[S]], i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll @@ -16,6 +16,7 @@ ; CHECK-LABEL: @bcast_long( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A0:%.*]] = load i32, i32* [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[A0]], i32 0 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1 ; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2 @@ -24,7 +25,6 @@ ; CHECK-NEXT: [[IDXS5:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 5 ; CHECK-NEXT: [[IDXS6:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 6 ; CHECK-NEXT: [[IDXS7:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[A0]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDXS0]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP1]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll @@ -242,15 +242,15 @@ ; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* ; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] -; SSE-NEXT: [[TMP5:%.*]] = 
shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1 -; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[B0]], i64 1 +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP7]], [[TMP4]] ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> ; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> ; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> ; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> ; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> @@ -266,7 +266,7 @@ ; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 ; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] ; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] ; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i64 0 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -242,15 +242,15 @@ ; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* ; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1 -; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[B0]], i64 1 +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP7]], [[TMP4]] ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> ; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> ; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> ; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> ; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to 
<4 x i32> @@ -266,7 +266,7 @@ ; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 ; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] ; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] ; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,9 +16,9 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 +; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 ; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer ; SSE-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] @@ -26,9 +26,9 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( -; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, 
<16 x i32> +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; AVX-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 ; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer ; AVX-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -58,11 +58,11 @@ ; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 ; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] ; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -11,11 +11,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> poison, double* [[TMP0]], i32 0 ; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double** [[_M_CUR2_I_I81]] to <2 x double*>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double*>, <2 x double*>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double*> [[TMP3]], <2 x double*> [[TMP1]], <2 x i32> ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] @@ -24,10 +24,9 @@ ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: 
_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double*> [ [[TMP4]], [[ENTRY:%.*]] ], [ [[TMP3]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double** [[_M_CUR2_I_I]] to <2 x double*>* +; CHECK-NEXT: store <2 x double*> [[TMP5]], <2 x double*>* [[TMP6]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -8,16 +8,18 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement 
<2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP9]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP9]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -27,16 +29,13 @@ ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP10]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef +; CHECK-NEXT: 
[[TMP14:%.*]] = fmul fast <2 x double> [[TMP8]], undef ; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB2:%.*]] ; CHECK-NEXT: ] @@ -45,7 +44,7 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[LABEL]] ; CHECK: label: -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB1]] ], [ [[TMP14]], [[BB2]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -129,11 +129,10 @@ ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[INBUF]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -35,10 +35,10 @@ ; CHECK-NEXT: 
[[TMP2:%.*]] = fmul <2 x double> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* @@ -114,17 +114,9 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double undef, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]] ; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 
0, i32 1, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> undef, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: br label [[RETURN:%.*]] ; CHECK: if.then78: ; CHECK-NEXT: br label [[RETURN]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -68,15 +68,15 @@ ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A]] to <4 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], 
[[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP6]], align 8 @@ -200,15 +200,15 @@ ; CHECK-LABEL: @foo4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A]] to <4 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP6]], align 8 @@ -254,12 +254,12 @@ ; CHECK-LABEL: @partial_mrg( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 +; CHECK-NEXT: 
[[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[A]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[N]], 4 @@ -271,7 +271,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV12]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -16,16 +16,16 @@ ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 
[[MUL238]], i32 0 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 @@ -67,20 +67,20 @@ ; CHECK-LABEL: @extr_user( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; CHECK-NEXT: 
[[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: ret i32 [[TMP5]] ; entry: @@ -111,20 +111,20 @@ ; CHECK-LABEL: @extr_user1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> 
[[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: ret i32 [[TMP5]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll @@ -5,9 +5,9 @@ ; CHECK-LABEL: @diamond_broadcast( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll @@ -5,9 +5,9 @@ ; CHECK-LABEL: @diamond_broadcast( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 @@ -35,9 +35,9 @@ ; CHECK-LABEL: @diamond_broadcast2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 @@ -65,9 +65,9 @@ ; CHECK-LABEL: @diamond_broadcast3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: 
[[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll @@ -5,10 +5,9 @@ ; CHECK-LABEL: @g( ; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 ; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] -; CHECK-NEXT: ret <2 x i8> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <2 x i8> [[TMP2]] ; %x0 = extractelement <2 x i8> %x, i32 0 %y1 = extractelement <2 x i8> %y, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll @@ -5,10 +5,9 @@ ; CHECK-LABEL: @g( ; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 ; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] -; CHECK-NEXT: 
ret <2 x i8> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <2 x i8> [[TMP2]] ; %x0 = extractelement <2 x i8> %x, i32 0 %y1 = extractelement <2 x i8> %y, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef, align 32 -; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0 -; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[P0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll @@ -15,12 +15,11 @@ define float @multi_uses(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: 
@multi_uses( ; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x float> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Y1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[Y]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -74,31 +74,27 @@ define float @f_used_twice_in_tree(<2 x float> %x) { ; CHECK-LABEL: @f_used_twice_in_tree( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1 -; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X1]] -; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float 
[[TMP3]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] ; ; THRESH1-LABEL: @f_used_twice_in_tree( -; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH1-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH1-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], [[X]] +; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH1-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; THRESH1-NEXT: ret float [[ADD]] ; ; THRESH2-LABEL: @f_used_twice_in_tree( -; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH2-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH2-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH2-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], [[X]] +; THRESH2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; THRESH2-NEXT: ret 
float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll @@ -16,9 +16,9 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[N:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -21,18 +21,11 @@ ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 
0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7 -; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP9]]) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP12]], [[CONV]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 ; CHECK-NEXT: ret float [[OP_EXTRA1]] @@ -50,18 +43,11 @@ ; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 ; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 ; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; THRESHOLD-NEXT: [[TMP10:%.*]] = 
extractelement <2 x float> [[TMP8]], i32 1 -; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0 -; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1 -; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2 -; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3 -; THRESHOLD-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4 -; THRESHOLD-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5 -; THRESHOLD-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6 -; THRESHOLD-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7 -; THRESHOLD-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]]) -; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP3]], <8 x i32> +; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; THRESHOLD-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP9]]) +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP12]], [[CONV]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -909,15 +909,15 @@ ; THRESH-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) -; THRESH-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]] -; THRESH-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP6]] -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1 -; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1 -; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]] -; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]] +; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP4]], i32 1 +; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; THRESH-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP6]] +; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP6]] +; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP3]], i32 1 +; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP14]], [[TMP9]] +; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP14]], <2 x i32> [[TMP9]] ; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 ; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 ; THRESH-NEXT: 
[[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] @@ -1095,17 +1095,17 @@ ; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; THRESH-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; THRESH-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; THRESH-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> poison, i1 [[TMP12]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1 -; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 -; THRESH-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1 -; THRESH-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 -; THRESH-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1 -; THRESH-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]] +; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP4]], i32 1 +; THRESH-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP7]] +; THRESH-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP7]] +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0 +; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP3]], i32 1 +; THRESH-NEXT: 
[[TMP16:%.*]] = icmp sgt i32 [[TMP13]], [[TMP8]] +; THRESH-NEXT: [[TMP17:%.*]] = insertelement <2 x i1> poison, i1 [[TMP16]], i32 0 +; THRESH-NEXT: [[TMP18:%.*]] = insertelement <2 x i1> [[TMP17]], i1 [[TMP5]], i32 1 +; THRESH-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP18]], <2 x i32> [[TMP15]], <2 x i32> [[TMP10]] ; THRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 ; THRESH-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 ; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] @@ -1476,8 +1476,8 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @PR49730( -; AVX-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> ) -; AVX-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]] +; AVX-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) +; AVX-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] ; AVX-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef ; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) ; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) @@ -1486,8 +1486,8 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @PR49730( -; AVX2-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> ) -; AVX2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]] +; AVX2-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) +; AVX2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] ; AVX2-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef ; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) ; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) @@ -1496,8 +1496,8 @@ ; AVX2-NEXT: ret void ; ; THRESH-LABEL: @PR49730( -; THRESH-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> ) -; THRESH-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]] +; THRESH-NEXT: [[TMP1:%.*]] = call <4 x i32> 
@llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) +; THRESH-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] ; THRESH-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef ; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) ; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll @@ -9,16 +9,13 @@ ; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 ; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: ret <4 x float> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: 
[[TMP4:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP3]], <2 x float> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP6]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -144,21 +144,19 @@ ; MINTREESIZE-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; MINTREESIZE-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1 ; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1 ; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd 
float [[Q2]], [[Q3]] -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 +; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] -; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0 +; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) ; MINTREESIZE-NEXT: ret <4 x float> undef @@ -289,25 +287,19 @@ ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float 
[[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP18]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP5]], <2 x float> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP8]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x 
float> poison, <4 x float> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -441,24 +433,12 @@ ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @reschedule_extract( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: ret <4 x float> [[TMP11]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; 
MINTREESIZE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: ret <4 x float> [[TMP2]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -491,24 +471,12 @@ ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @take_credit( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP17]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> 
[[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -565,40 +533,16 @@ ; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @_vadd256( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[B]], i32 6 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[B]], i32 5 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[B]], i32 4 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[B]], i32 3 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[B]], i32 2 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[B]], i32 1 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[B]], i32 0 -; MINTREESIZE-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[A]], i32 6 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[A]], i32 5 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[A]], i32 4 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[A]], i32 3 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[A]], i32 2 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = extractelement <8 x float> [[A]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[A]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP16]], i32 0 -; MINTREESIZE-NEXT: [[TMP18:%.*]] = insertelement <2 x 
float> [[TMP17]], float [[TMP8]], i32 1 -; MINTREESIZE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0 -; MINTREESIZE-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP7]], i32 1 -; MINTREESIZE-NEXT: [[TMP21:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i32 0 -; MINTREESIZE-NEXT: [[TMP22:%.*]] = insertelement <2 x float> [[TMP21]], float [[TMP6]], i32 1 -; MINTREESIZE-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0 -; MINTREESIZE-NEXT: [[TMP24:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP5]], i32 1 -; MINTREESIZE-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0 -; MINTREESIZE-NEXT: [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 -; MINTREESIZE-NEXT: [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0 -; MINTREESIZE-NEXT: [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP31:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0 -; MINTREESIZE-NEXT: [[TMP32:%.*]] = insertelement <2 x float> [[TMP31]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP33:%.*]] = fadd <8 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <8 x float> [[TMP33]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; 
MINTREESIZE-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <8 x float> [[TMP9]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -179,21 +179,19 @@ ; MINTREESIZE-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; MINTREESIZE-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1 ; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1 ; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float 
[[Q0]], [[Q1]] ; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 +; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] -; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0 +; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) ; MINTREESIZE-NEXT: ret <4 x float> undef @@ -324,25 +322,19 @@ ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: 
[[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP18]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP5]], <2 x float> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP8]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x 
i32> +; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -476,24 +468,12 @@ ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @reschedule_extract( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: ret <4 x float> [[TMP11]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: [[TMP3:%.*]] = shufflevector 
<4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: ret <4 x float> [[TMP2]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -526,24 +506,12 @@ ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @take_credit( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[B]], i32 1 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[B]], i32 0 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[A]], i32 2 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[A]], i32 1 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP17]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] 
= shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -600,40 +568,16 @@ ; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @_vadd256( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[B]], i32 6 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[B]], i32 5 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[B]], i32 4 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[B]], i32 3 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[B]], i32 2 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[B]], i32 1 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[B]], i32 0 -; MINTREESIZE-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[A]], i32 6 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[A]], i32 5 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[A]], i32 4 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[A]], i32 3 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[A]], i32 2 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = extractelement <8 x float> [[A]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[A]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP16]], i32 0 -; 
MINTREESIZE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP8]], i32 1 -; MINTREESIZE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i32 0 -; MINTREESIZE-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP7]], i32 1 -; MINTREESIZE-NEXT: [[TMP21:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i32 0 -; MINTREESIZE-NEXT: [[TMP22:%.*]] = insertelement <2 x float> [[TMP21]], float [[TMP6]], i32 1 -; MINTREESIZE-NEXT: [[TMP23:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0 -; MINTREESIZE-NEXT: [[TMP24:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP5]], i32 1 -; MINTREESIZE-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0 -; MINTREESIZE-NEXT: [[TMP26:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 -; MINTREESIZE-NEXT: [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0 -; MINTREESIZE-NEXT: [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP31:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0 -; MINTREESIZE-NEXT: [[TMP32:%.*]] = insertelement <2 x float> [[TMP31]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP33:%.*]] = fadd <8 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <8 x float> [[TMP33]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> 
[[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <8 x float> [[TMP9]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -11,15 +11,15 @@ ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_SW]], %struct.sw* [[V]], i64 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[X]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* undef, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> poison, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], poison -; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], poison +; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> undef, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], undef +; CHECK-NEXT: 
[[TMP9:%.*]] = fadd <4 x float> [[TMP8]], undef ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 ; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -8,11 +8,10 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll @@ -52,7 +52,7 @@ ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, 
[[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ poison, [[ENTRY]] ], [ [[TMP26]], [[FOR_INC]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP26]], [[FOR_INC]] ] ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -17,15 +17,15 @@ ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP2]], [[TMP5]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> 
[[SHUFFLE]], <4 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 @@ -69,22 +69,15 @@ ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP5]], align 4 
; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -43,14 +43,13 @@ ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[SHUFFLE]], <4 x float> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fsub <4 x float> [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -54,15 +54,14 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I21]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[I3]] +; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[X2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP6]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,15 +54,14 @@ ; 
CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I21]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[I3]] +; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[X2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP6]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -27,19 +27,17 @@ ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; 
CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <8 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: ret void ; 
entry: @@ -98,12 +96,12 @@ ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <4 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 @@ -161,23 +159,21 @@ ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x 
double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <8 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP2]] +; 
CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -237,29 +233,29 @@ ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 1 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double 
[[D0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -335,17 +331,17 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load 
double, double* [[IDXB2]], align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] +; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 +; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* @@ -453,17 +449,15 @@ ; CHECK-LABEL: @ChecksExtractScores( ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 ; 
CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 -; CHECK-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4 -; CHECK-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 ; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* @@ -598,21 +592,19 @@ ; CHECK-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 ; CHECK-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1 ; 
CHECK-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> ; CHECK-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> ; CHECK-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 ; CHECK-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP6]] ; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 ; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; %idx0 = getelementptr inbounds double, double* %array, i64 0 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -17,23 +17,17 @@ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SUB86_1]], i32 4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[ADD78_2]], i32 5 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD78_1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD94_1]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_1]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[ADD78_2]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SUB102_3]], i32 5 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = lshr <16 x i32> [[TMP14]], -; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[TMP15]], -; CHECK-NEXT: [[TMP17:%.*]] = mul nuw <16 x i32> [[TMP16]], -; CHECK-NEXT: [[TMP18:%.*]] = add <16 x i32> [[TMP17]], [[TMP14]] -; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP19]]) -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP20]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> 
poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = lshr <16 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <16 x i32> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]]) +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP15]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -34,11 +34,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> 
[[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -70,11 +70,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -106,11 +106,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; 
CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -144,11 +144,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -180,11 +180,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: 
[[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -217,11 +217,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> 
[[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -260,26 +260,26 @@ ; CHECK: for.body3: ; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 4 +; 
CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] ; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i64 3 ; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] ; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll @@ -15,10 +15,10 @@ ; CHECK: while.body.lr.ph: ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 ; CHECK-NEXT: [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i1> [[TMP3]], i1 [[ICMP_A1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], 
<2 x i64> [[TMP2]], <2 x i64> [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> [[TMP2]], i1 [[ICMP_A1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP0]] ; CHECK-NEXT: br label [[WHILE_END]] ; CHECK: while.end: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5]], [[WHILE_BODY_LR_PH]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -13,13 +13,13 @@ ; CHECK: if.end: ; CHECK-NEXT: [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef ; CHECK-NEXT: [[SHR15:%.*]] = ashr i32 [[SUB14]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[SHR15]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[SUB14]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP0]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], poison -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> poison +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], undef +; CHECK-NEXT: [[TMP5:%.*]] = 
select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> undef ; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[TMP5]] to <4 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -244,27 +244,20 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP9]] = fmul <4 x float> [[TMP8]], +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP2:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2]] = fmul <4 x float> [[TMP1]], ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 -; 
CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP13]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP6]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -69,10 +69,10 @@ ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 @@ -95,10 +95,10 @@ ; AVX-LABEL: @pr35497( ; AVX-NEXT: entry: ; AVX-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 
[[TMP0]], i32 1 ; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef ; AVX-NEXT: store i64 [[ADD]], i64* undef, align 1 ; AVX-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -145,11 +145,11 @@ ; ; AVX-LABEL: @store_i64( ; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer -; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]] +; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX-NEXT: [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP7]], [[SHUFFLE]] ; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], ; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> ; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -26,9 +26,9 @@ ; AVX-LABEL: @foo( ; AVX-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 ; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux 
-mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( @@ -22,6 +22,85 @@ ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void +; +; SSE-LABEL: @gather_load( +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = bitcast i32* 
[[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: ret void +; +; AVX-LABEL: @gather_load( +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: ret void +; +; AVX2-LABEL: @gather_load( +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX2-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x 
i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: ret void +; +; AVX512F-LABEL: @gather_load( +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX512F-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX512F-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX512F-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; 
AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP6]], <2 x i32*> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> [[TMP8]], <4 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> [[TMP9]], i32* [[TMP3]], i64 3 +; AVX512VL-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], +; AVX512VL-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %1, align 4, !tbaa !2 @@ -254,65 +333,43 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] 
to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; 
AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32*> [[TMP9]], <8 x i32*> [[TMP10]], <8 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32*> [[TMP11]], <8 x i32*> [[TMP12]], <8 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x i32*> [[TMP13]], i32* [[TMP8]], i64 7 +; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP14]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP16:%.*]] = add <8 x i32> [[TMP15]], +; AVX512F-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP16]], <8 x i32>* [[TMP17]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> 
[[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 -; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 -; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4 -; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: 
[[TMP11:%.*]] = shufflevector <8 x i32*> [[TMP9]], <8 x i32*> [[TMP10]], <8 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32*> [[TMP11]], <8 x i32*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x i32*> [[TMP13]], i32* [[TMP8]], i64 7 +; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP14]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP16:%.*]] = add <8 x i32> [[TMP15]], +; AVX512VL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP16]], <8 x i32>* [[TMP17]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -457,65 +514,43 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 -; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 
x i32*> [[TMP4]], <2 x i64> ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <8 x i32*> poison, i32* [[T1]], i64 0 +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 
x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32*> [[TMP6]], <8 x i32*> [[TMP7]], <8 x i32> +; AVX512F-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32*> [[TMP8]], <8 x i32*> [[TMP9]], <8 x i32> +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <8 x i32*> [[TMP10]], i32* [[T30]], i64 7 +; AVX512F-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP11]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP13:%.*]] = add <8 x i32> [[TMP12]], +; AVX512F-NEXT: [[TMP14:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP13]], <8 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512VL-NEXT: [[T3:%.*]] 
= load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], -; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <8 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32*> [[TMP6]], <8 x i32*> [[TMP7]], <8 x i32> +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32*> [[TMP8]], <8 x i32*> [[TMP9]], <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x i32*> [[TMP10]], i32* [[T30]], i64 7 +; AVX512VL-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP11]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP13:%.*]] = add <8 x i32> [[TMP12]], +; AVX512VL-NEXT: [[TMP14:%.*]] = bitcast i32* 
[[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP13]], <8 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 @@ -567,211 +602,214 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> 
poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] -; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> 
poison, float [[TMP30]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2 -; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 -; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; SSE-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP8]], <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; SSE-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP13]], <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x 
i32> +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP17]], <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP21]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP23]], <4 x float>* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; SSE-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP35:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP38]], 
align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP45:%.*]] = load float, float* [[TMP44]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i64 3 +; SSE-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP49]] +; SSE-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP24]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP50]], <4 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], 
i64 11 -; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], 
float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: 
[[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP13]], <8 x i32> +; AVX-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP15]], <8 x i32> +; AVX-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP17]], <8 x i32> +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; 
AVX-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 4 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP26]], i64 5 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP30]], i64 6 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP34]], i64 7 +; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP18]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP20]], i64 3 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP24]], i64 4 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP28]], i64 5 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP32]], i64 6 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP42]], i64 7 +; AVX-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP16]], <8 x float> [[TMP47]], <8 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP40]], [[TMP48]] +; AVX-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP7:%.*]] = load float, float* 
[[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP31:%.*]] 
= load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] 
= getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX2-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX2-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX2-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP13]], <8 x i32> +; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP17]], <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: 
[[TMP28:%.*]] = load float, float* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 4 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP26]], i64 5 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP30]], i64 6 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP34]], i64 7 +; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP18]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP20]], i64 3 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP24]], i64 4 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP28]], i64 5 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP32]], i64 6 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP42]], i64 7 +; AVX2-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP16]], <8 x float> [[TMP47]], <8 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP40]], [[TMP48]] +; AVX2-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX2-NEXT: 
store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 13 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> ; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x 
i32> -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <16 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP8]], <4 x float*> poison, <16 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <16 x float*> [[TMP11]], <16 x float*> [[TMP12]], <16 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <16 x i32> +; AVX512F-NEXT: [[TMP15:%.*]] = shufflevector <16 x float*> [[TMP13]], <16 x float*> [[TMP14]], <16 x i32> +; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <16 x float*> [[TMP15]], float* [[TMP3]], i64 7 +; AVX512F-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <16 x i32> +; AVX512F-NEXT: [[TMP18:%.*]] = shufflevector <16 x float*> [[TMP16]], <16 x float*> [[TMP17]], <16 x i32> +; AVX512F-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP20:%.*]] = shufflevector 
<16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP21]], [[TMP20]] +; AVX512F-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 13 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> ; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> 
[[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <16 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP8]], <4 x float*> poison, <16 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <16 x float*> [[TMP11]], <16 x float*> [[TMP12]], <16 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <16 x i32> +; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <16 x float*> [[TMP13]], <16 x float*> [[TMP14]], <16 x i32> +; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <16 x float*> [[TMP15]], float* [[TMP3]], i64 7 +; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> 
[[TMP10]], <8 x float*> poison, <16 x i32> +; AVX512VL-NEXT: [[TMP18:%.*]] = shufflevector <16 x float*> [[TMP16]], <16 x float*> [[TMP17]], <16 x i32> +; AVX512VL-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP21]], [[TMP20]] +; AVX512VL-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s 
-slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( @@ -22,6 +22,85 @@ ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void +; +; SSE-LABEL: @gather_load( +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: ret void +; +; AVX-LABEL: @gather_load( +; AVX-NEXT: [[TMP3:%.*]] = getelementptr 
inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: ret void +; +; AVX2-LABEL: @gather_load( +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX2-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i32* 
[[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: ret void +; +; AVX512F-LABEL: @gather_load( +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 1 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX512F-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX512F-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX512F-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP6]], <2 x i32*> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> [[TMP8]], <4 x i32> +; AVX512VL-NEXT: 
[[TMP10:%.*]] = insertelement <4 x i32*> [[TMP9]], i32* [[TMP3]], i64 3 +; AVX512VL-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], +; AVX512VL-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %1, align 4, !tbaa !2 @@ -254,65 +333,43 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: 
[[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> poison, <8 x i32> +; AVX512F-NEXT: 
[[TMP11:%.*]] = shufflevector <8 x i32*> [[TMP9]], <8 x i32*> [[TMP10]], <8 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32*> [[TMP11]], <8 x i32*> [[TMP12]], <8 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x i32*> [[TMP13]], i32* [[TMP8]], i64 7 +; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP14]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP16:%.*]] = add <8 x i32> [[TMP15]], +; AVX512F-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP16]], <8 x i32>* [[TMP17]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* 
[[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 -; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 -; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4 -; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32*> [[TMP9]], <8 x i32*> [[TMP10]], <8 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> poison, <8 x i32> +; 
AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32*> [[TMP11]], <8 x i32*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x i32*> [[TMP13]], i32* [[TMP8]], i64 7 +; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP14]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP16:%.*]] = add <8 x i32> [[TMP15]], +; AVX512VL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP16]], <8 x i32>* [[TMP17]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -457,65 +514,43 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 -; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] 
-; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <8 x i32*> poison, i32* [[T1]], i64 0 +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32*> [[TMP6]], <8 x i32*> [[TMP7]], <8 x i32> +; AVX512F-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32*> 
[[TMP5]], <2 x i32*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32*> [[TMP8]], <8 x i32*> [[TMP9]], <8 x i32> +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <8 x i32*> [[TMP10]], i32* [[T30]], i64 7 +; AVX512F-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP11]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP13:%.*]] = add <8 x i32> [[TMP12]], +; AVX512F-NEXT: [[TMP14:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP13]], <8 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> 
undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], -; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <8 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32*> [[TMP6]], <8 x i32*> [[TMP7]], <8 x i32> +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32*> [[TMP8]], <8 x i32*> [[TMP9]], <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x i32*> [[TMP10]], i32* [[T30]], i64 7 +; AVX512VL-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP11]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP13:%.*]] = add <8 x i32> [[TMP12]], +; AVX512VL-NEXT: [[TMP14:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP13]], <8 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* 
%t0, i64 1 @@ -567,211 +602,214 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], 
float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] -; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], 
float [[TMP38]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2 -; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 -; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; SSE-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP8]], <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; SSE-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP13]], <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x i32> +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> 
[[TMP17]], <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP21]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP23]], <4 x float>* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; SSE-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP35:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP38]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], 
float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP45:%.*]] = load float, float* [[TMP44]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i64 3 +; SSE-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP49]] +; SSE-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP24]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP50]], <4 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; 
AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], 
float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX-NEXT: [[TMP13:%.*]] = load 
<2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP13]], <8 x i32> +; AVX-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP15]], <8 x i32> +; AVX-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP17]], <8 x i32> +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP36:%.*]] = 
insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 4 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP26]], i64 5 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP30]], i64 6 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP34]], i64 7 +; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP18]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP20]], i64 3 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP24]], i64 4 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP28]], i64 5 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP32]], i64 6 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP42]], i64 7 +; AVX-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP16]], <8 x float> [[TMP47]], <8 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP40]], [[TMP48]] +; AVX-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa 
[[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* 
[[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast float* 
[[TMP6]] to <2 x float>* +; AVX2-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX2-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX2-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP13]], <8 x i32> +; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> [[TMP17]], <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP30:%.*]] 
= load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 4 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP26]], i64 5 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP30]], i64 6 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP34]], i64 7 +; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP18]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP20]], i64 3 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP24]], i64 4 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP28]], i64 5 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP32]], i64 6 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP42]], i64 7 +; AVX2-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP16]], <8 x float> [[TMP47]], <8 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP40]], [[TMP48]] +; AVX2-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX2-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 
x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 13 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> ; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x 
float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <16 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP8]], <4 x float*> poison, <16 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <16 x float*> [[TMP11]], <16 x float*> [[TMP12]], <16 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <16 x i32> +; AVX512F-NEXT: [[TMP15:%.*]] = shufflevector <16 x float*> [[TMP13]], <16 x float*> [[TMP14]], <16 x i32> +; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <16 x float*> [[TMP15]], float* [[TMP3]], i64 7 +; AVX512F-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <16 x i32> +; AVX512F-NEXT: [[TMP18:%.*]] = shufflevector <16 x float*> [[TMP16]], <16 x float*> [[TMP17]], <16 x i32> +; AVX512F-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP22:%.*]] = 
fdiv <8 x float> [[TMP21]], [[TMP20]] +; AVX512F-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 13 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> ; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 
x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <16 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP8]], <4 x float*> poison, <16 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <16 x float*> [[TMP11]], <16 x float*> [[TMP12]], <16 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <16 x i32> +; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <16 x float*> [[TMP13]], <16 x float*> [[TMP14]], <16 x i32> +; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <16 x float*> [[TMP15]], float* [[TMP3]], i64 7 +; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <16 x i32> +; AVX512VL-NEXT: [[TMP18:%.*]] = shufflevector <16 x float*> [[TMP16]], <16 x float*> [[TMP17]], <16 x i32> +; AVX512VL-NEXT: [[TMP19:%.*]] = call <16 x 
float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP21]], [[TMP20]] +; AVX512VL-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; ; AVX512-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -100,14 +100,13 @@ ; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 ; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 ; SSE-NEXT: [[C2:%.*]] = icmp sgt 
i32 [[X2]], 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X3]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X1]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = icmp slt <2 x i32> [[TMP2]], zeroinitializer -; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP4]], i1 false +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false ; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP5]], i1 false +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false ; SSE-NEXT: ret i1 [[S3]] ; ; AVX-LABEL: @logical_and_icmp_diff_preds( @@ -354,19 +353,16 @@ ; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 ; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x 
i32> [[TMP7]], i32 [[Y3]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = freeze <8 x i1> [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP10]]) -; CHECK-NEXT: ret i1 [[TMP11]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[Y1]], i32 5 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[Y2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y3]], i32 7 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) +; CHECK-NEXT: ret i1 [[TMP8]] ; %x0 = extractelement <8 x i32> %x, i32 0 %x1 = extractelement <8 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -87,15 +87,15 @@ ; CHECK-LABEL: @fcmp_lt_gt( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x 
double> poison, double [[MUL]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[MUL]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 @@ -136,15 +136,15 @@ define i1 @fcmp_lt(double %a, double %b, double %c) { ; CHECK-LABEL: @fcmp_lt( ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 +; 
CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[MUL]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[B]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = fcmp uge <2 x double> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -6,6 +6,7 @@ ; CHECK-LABEL: @fextr( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <8 x i16>, <8 x i16>* undef, align 16 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: br label [[T:%.*]] ; CHECK: t: ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 0 @@ -16,12 +17,9 @@ ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 5 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 6 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[LD]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[P0]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[LD]], [[TMP0]] +; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP1]], <8 x i16>* [[TMP2]], align 2 ; CHECK-NEXT: ret void ; ; YAML: Pass: slp-vectorizer @@ -29,7 +27,7 @@ ; YAML-NEXT: Function: fextr ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-20' +; YAML-NEXT: - Cost: '-22' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -26,13 +26,13 @@ ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP1]], 7 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP11]] ; CHECK-NEXT: 
[[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 ; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 @@ -108,7 +108,7 @@ ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-5' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '4' + ; YAML-NEXT: - TreeSize: '5' ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -14,26 +14,19 @@ ; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] ; CHECK-NEXT: [[TMP9:%.*]] = shl nsw <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i32> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector 
<4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> [[TMP26]], <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* [[TMP29]], align 16 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP15]] to 
<4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 16 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds i8, i8* undef, i64 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -10,32 +10,34 @@ ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* 
[[B]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP8]] to <2 x float>* +; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP7]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd <2 x float> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = fsub <2 x float> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> +; CHECK-NEXT: [[TMP19]] = fadd <2 x float> [[TMP2]], [[TMP18]] ; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] ; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 -; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1 
-; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[TMP19]], i32 1 +; CHECK-NEXT: store float [[TMP23]], float* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[TMP19]], i32 0 +; CHECK-NEXT: store float [[TMP25]], float* [[TMP24]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,7 +11,11 @@ ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[CONV31_I]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 @@ -19,27 +23,23 @@ ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", 
%"struct.std::array"* undef, i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[SHUFFLE1]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr 
inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP1]], ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_12_I_I]], i32 13 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <16 x i32> @@ -47,7 +47,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8> ; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* ; CHECK-NEXT: store <16 x i8> [[TMP17]], <16 x i8>* [[TMP18]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/return.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll @@ -44,12 +44,12 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <4 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 ; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll @@ -6,7 +6,7 @@ ; CHECK-NEXT: for.cond.preheader: ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_INC_PREHEADER:%.*]] ; CHECK: for.inc.preheader: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[TMP0:%.*]], i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[TMP0:%.*]], i32 6 ; CHECK-NEXT: br i1 
false, label [[FOR_END]], label [[L1_PREHEADER:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[DOTPR:%.*]] = phi i32 [ 0, [[FOR_INC_PREHEADER]] ], [ 0, [[FOR_COND_PREHEADER:%.*]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -25,13 +25,13 @@ ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP1]], 7 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 ; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 ; CHECK-NEXT: 
[[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -8,19 +8,19 @@ ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 -; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], poison -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> poison, <4 x i32> [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> poison, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], undef +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> undef, <4 x i32> 
[[SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll @@ -30,7 +30,7 @@ ; CHECK: sw.epilog: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ] ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> poison, [[SHUFFLE]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> undef, [[SHUFFLE]] ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll @@ -4,37 +4,30 @@ define void @foo(i32* noalias nocapture writeonly %B, i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %C, i32 %n, i32 %m) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = 
load i32, i32* [[C:%.*]], align 4 -; CHECK-NEXT: [[MUL2:%.*]] = mul nsw i32 [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL2]], [[MUL]] -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL4:%.*]] = mul nsw i32 [[ADD]], [[TMP2]] -; CHECK-NEXT: store i32 [[MUL4]], i32* [[B:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], [[M]] -; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[MUL9]], [[MUL]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[MUL12:%.*]] = mul nsw i32 [[ADD10]], [[TMP4]] -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; CHECK-NEXT: store i32 [[MUL12]], i32* [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP2]], [[N]] -; CHECK-NEXT: [[MUL17:%.*]] = mul nsw i32 [[TMP4]], [[M]] -; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL15]] -; CHECK-NEXT: [[MUL20:%.*]] = mul nsw i32 [[ADD18]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[N:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[N]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> 
poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX24]], align 4 -; CHECK-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP5]], [[M]] -; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL25]], [[MUL15]] -; CHECK-NEXT: [[MUL28:%.*]] = mul nsw i32 [[ADD26]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP11]], [[TMP8]] ; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; CHECK-NEXT: store i32 [[MUL28]], i32* [[ARRAYIDX29]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -66,10 +66,10 @@ ; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 ; CHECK-NEXT: [[I3:%.*]] = load double, 
double* [[ARRAYIDX3]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -28,22 +28,30 @@ ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32*> poison, i32* [[ARRAYIDX1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32*> [[TMP2]], i32* [[ARRAYIDX6]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32*> [[TMP3]], i32* [[ARRAYIDX13]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32*> [[TMP4]], i32* [[ARRAYIDX20]], 
i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32*> [[TMP5]], i32* [[ARRAYIDX27]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32*> [[TMP6]], i32* [[ARRAYIDX34]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32*> [[TMP7]], i32* [[ARRAYIDX41]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef) -; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32*> poison, i32* [[ARRAYIDX]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32*> [[TMP0]], i32* [[ARRAYIDX4]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32*> [[TMP1]], i32* [[ARRAYIDX11]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32*> [[TMP2]], i32* [[ARRAYIDX18]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32*> [[TMP3]], i32* [[ARRAYIDX25]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32*> [[TMP4]], i32* [[ARRAYIDX32]], i32 5 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32*> [[TMP5]], i32* [[ARRAYIDX39]], i32 6 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32*> [[TMP6]], i32* [[ARRAYIDX46]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32*> [[TMP7]], i32* [[ARRAYIDX20]], i32 8 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 9 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32*> [[TMP9]], i32* [[ARRAYIDX13]], i32 10 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32*> [[TMP10]], i32* [[ARRAYIDX6]], i32 11 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32*> [[TMP11]], i32* [[ARRAYIDX27]], i32 12 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32*> [[TMP12]], i32* [[ARRAYIDX34]], i32 13 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32*> [[TMP13]], i32* [[ARRAYIDX41]], i32 14 +; CHECK-NEXT: 
[[TMP15:%.*]] = insertelement <16 x i32*> [[TMP14]], i32* [[ARRAYIDX1]], i32 15 +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP19]], <8 x i32>* [[TMP20]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -136,13 +144,10 @@ ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[TMP7]], <8 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x 
i32> [[TMP4]], <8 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,24 +210,22 @@ ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <8 x i32> ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[G20]] to <2 x i32>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <8 x i32> ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[G22]] to <2 x i32>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[G22]] to <2 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP11]], <8 x i32> ; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x 
i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP13]], <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP12]], <8 x i32>* [[TMP13]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -9,26 +9,26 @@ ; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 ; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_4]], i32 1 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: 
[[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 ; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[LOAD_7]], i32 1 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[LOAD_8]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP4]], [[TMP8]] ; CHECK-NEXT: br label [[BLOCK1:%.*]] ; CHECK: block1: ; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 5 @@ -42,7 +42,7 @@ ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, i32* 
[[OUT]], i64 4 ; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP9]], <2 x i32>* [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -15,20 +15,18 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] -; 
ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP6]] +; ENABLED-NEXT: [[TMP10:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; ENABLED-NEXT: ret void ; entry: @@ -74,20 +72,18 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; 
ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] -; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP3]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP6]] +; ENABLED-NEXT: [[TMP10:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; ENABLED-NEXT: ret void ; entry: @@ -216,19 +212,19 @@ ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 ; ENABLED-NEXT: [[C:%.*]] = load double, double* [[IDXC]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; 
ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; ENABLED-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP5]] ; ENABLED-NEXT: [[D:%.*]] = load double, double* [[IDXD]], align 8 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] -; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D]], i32 1 +; ENABLED-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP7]], <2 x i32> +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP8]] +; ENABLED-NEXT: [[TMP10:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -168,17 +168,17 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 ; 
CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] @@ -314,6 +314,8 @@ ; CHECK-LABEL: @tiny_vector_with_diff_opcode( ; 
CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[V1:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 undef to i16 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i16> [[TMP3]], i16 [[TMP2]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 2 @@ -322,9 +324,7 @@ ; CHECK-NEXT: [[PTR5:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 5 ; CHECK-NEXT: [[PTR6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 6 ; CHECK-NEXT: [[PTR7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 7 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP2]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[PTR0]] to <8 x i16>* ; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP5]], align 16 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll @@ -28,101 +28,101 @@ ; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[SPEC_SELECT8_3_7:%.*]], [[FOR_COND]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], -183 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = sub 
<4 x i32> [[SHUFFLE]], [[TMP0]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP20]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP21]], [[B_0]] -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP21]], i32 [[B_0]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[SHUFFLE]], [[TMP0]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP21]]) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP22]], [[B_0]] +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP22]], i32 [[B_0]] ; CHECK-NEXT: [[SUB_116:%.*]] = sub i32 [[TMP15]], [[TMP1]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[SUB_116]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = icmp slt i32 [[SUB_116]], 0 ; CHECK-NEXT: [[NEG_117:%.*]] = sub nsw i32 0, [[SUB_116]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[NEG_117]], i32 [[SUB_116]] -; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP23]], [[OP_EXTRA1]] -; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP23]], i32 [[OP_EXTRA1]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[NEG_117]], i32 [[SUB_116]] +; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 
[[TMP24]], [[OP_EXTRA1]] +; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP24]], i32 [[OP_EXTRA1]] ; CHECK-NEXT: [[SUB_1_1:%.*]] = sub i32 [[TMP15]], [[TMP2]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i32 [[SUB_1_1]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = icmp slt i32 [[SUB_1_1]], 0 ; CHECK-NEXT: [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[NEG_1_1]], i32 [[SUB_1_1]] -; CHECK-NEXT: [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP25]], [[SPEC_SELECT8_120]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[NEG_1_1]], i32 [[SUB_1_1]] +; CHECK-NEXT: [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP26]], [[SPEC_SELECT8_120]] ; CHECK-NEXT: [[NARROW:%.*]] = or i1 [[CMP12_1_1]], [[CMP12_118]] -; CHECK-NEXT: [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP25]], i32 [[SPEC_SELECT8_120]] +; CHECK-NEXT: [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP26]], i32 [[SPEC_SELECT8_120]] ; CHECK-NEXT: [[SUB_2_1:%.*]] = sub i32 [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp slt i32 [[SUB_2_1]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = icmp slt i32 [[SUB_2_1]], 0 ; CHECK-NEXT: [[NEG_2_1:%.*]] = sub nsw i32 0, [[SUB_2_1]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[NEG_2_1]], i32 [[SUB_2_1]] -; CHECK-NEXT: [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP27]], [[SPEC_SELECT8_1_1]] +; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[NEG_2_1]], i32 [[SUB_2_1]] +; CHECK-NEXT: [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP28]], [[SPEC_SELECT8_1_1]] ; CHECK-NEXT: [[NARROW34:%.*]] = or i1 [[CMP12_2_1]], [[NARROW]] -; CHECK-NEXT: [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP27]], i32 [[SPEC_SELECT8_1_1]] +; CHECK-NEXT: [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP28]], i32 [[SPEC_SELECT8_1_1]] ; CHECK-NEXT: [[SUB_3_1:%.*]] = sub i32 [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp slt i32 [[SUB_3_1]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = icmp slt i32 [[SUB_3_1]], 0 ; 
CHECK-NEXT: [[NEG_3_1:%.*]] = sub nsw i32 0, [[SUB_3_1]] -; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[NEG_3_1]], i32 [[SUB_3_1]] -; CHECK-NEXT: [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP29]], [[SPEC_SELECT8_2_1]] +; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[NEG_3_1]], i32 [[SUB_3_1]] +; CHECK-NEXT: [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP30]], [[SPEC_SELECT8_2_1]] ; CHECK-NEXT: [[NARROW35:%.*]] = or i1 [[CMP12_3_1]], [[NARROW34]] ; CHECK-NEXT: [[SPEC_SELECT_3_1:%.*]] = zext i1 [[NARROW35]] to i32 -; CHECK-NEXT: [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP29]], i32 [[SPEC_SELECT8_2_1]] +; CHECK-NEXT: [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP30]], i32 [[SPEC_SELECT8_2_1]] ; CHECK-NEXT: [[SUB_222:%.*]] = sub i32 [[TMP15]], [[TMP5]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp slt i32 [[SUB_222]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = icmp slt i32 [[SUB_222]], 0 ; CHECK-NEXT: [[NEG_223:%.*]] = sub nsw i32 0, [[SUB_222]] -; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[NEG_223]], i32 [[SUB_222]] -; CHECK-NEXT: [[CMP12_224:%.*]] = icmp slt i32 [[TMP31]], [[SPEC_SELECT8_3_1]] -; CHECK-NEXT: [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP31]], i32 [[SPEC_SELECT8_3_1]] +; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[NEG_223]], i32 [[SUB_222]] +; CHECK-NEXT: [[CMP12_224:%.*]] = icmp slt i32 [[TMP32]], [[SPEC_SELECT8_3_1]] +; CHECK-NEXT: [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP32]], i32 [[SPEC_SELECT8_3_1]] ; CHECK-NEXT: [[SUB_1_2:%.*]] = sub i32 [[TMP15]], [[TMP6]] -; CHECK-NEXT: [[TMP32:%.*]] = icmp slt i32 [[SUB_1_2]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = icmp slt i32 [[SUB_1_2]], 0 ; CHECK-NEXT: [[NEG_1_2:%.*]] = sub nsw i32 0, [[SUB_1_2]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[NEG_1_2]], i32 [[SUB_1_2]] -; CHECK-NEXT: [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP33]], [[SPEC_SELECT8_226]] -; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]] -; CHECK-NEXT: 
[[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP33]], i32 [[SPEC_SELECT8_226]] +; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[NEG_1_2]], i32 [[SUB_1_2]] +; CHECK-NEXT: [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP34]], [[SPEC_SELECT8_226]] +; CHECK-NEXT: [[TMP35:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]] +; CHECK-NEXT: [[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP34]], i32 [[SPEC_SELECT8_226]] ; CHECK-NEXT: [[SUB_2_2:%.*]] = sub i32 [[TMP15]], [[TMP7]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp slt i32 [[SUB_2_2]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = icmp slt i32 [[SUB_2_2]], 0 ; CHECK-NEXT: [[NEG_2_2:%.*]] = sub nsw i32 0, [[SUB_2_2]] -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[NEG_2_2]], i32 [[SUB_2_2]] -; CHECK-NEXT: [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP36]], [[SPEC_SELECT8_1_2]] -; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[CMP12_2_2]], [[TMP34]] -; CHECK-NEXT: [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP36]], i32 [[SPEC_SELECT8_1_2]] +; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[NEG_2_2]], i32 [[SUB_2_2]] +; CHECK-NEXT: [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP37]], [[SPEC_SELECT8_1_2]] +; CHECK-NEXT: [[TMP38:%.*]] = or i1 [[CMP12_2_2]], [[TMP35]] +; CHECK-NEXT: [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP37]], i32 [[SPEC_SELECT8_1_2]] ; CHECK-NEXT: [[SUB_3_2:%.*]] = sub i32 [[TMP15]], [[TMP8]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp slt i32 [[SUB_3_2]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = icmp slt i32 [[SUB_3_2]], 0 ; CHECK-NEXT: [[NEG_3_2:%.*]] = sub nsw i32 0, [[SUB_3_2]] -; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[NEG_3_2]], i32 [[SUB_3_2]] -; CHECK-NEXT: [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP39]], [[SPEC_SELECT8_2_2]] -; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[CMP12_3_2]], [[TMP37]] -; CHECK-NEXT: [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP40]], i32 2, i32 [[SPEC_SELECT_3_1]] -; CHECK-NEXT: [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP39]], i32 [[SPEC_SELECT8_2_2]] 
+; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[NEG_3_2]], i32 [[SUB_3_2]] +; CHECK-NEXT: [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP40]], [[SPEC_SELECT8_2_2]] +; CHECK-NEXT: [[TMP41:%.*]] = or i1 [[CMP12_3_2]], [[TMP38]] +; CHECK-NEXT: [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP41]], i32 2, i32 [[SPEC_SELECT_3_1]] +; CHECK-NEXT: [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP40]], i32 [[SPEC_SELECT8_2_2]] ; CHECK-NEXT: [[SUB_328:%.*]] = sub i32 [[TMP15]], [[TMP9]] -; CHECK-NEXT: [[TMP41:%.*]] = icmp slt i32 [[SUB_328]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = icmp slt i32 [[SUB_328]], 0 ; CHECK-NEXT: [[NEG_329:%.*]] = sub nsw i32 0, [[SUB_328]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[NEG_329]], i32 [[SUB_328]] -; CHECK-NEXT: [[CMP12_330:%.*]] = icmp slt i32 [[TMP42]], [[SPEC_SELECT8_3_2]] -; CHECK-NEXT: [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP42]], i32 [[SPEC_SELECT8_3_2]] +; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[NEG_329]], i32 [[SUB_328]] +; CHECK-NEXT: [[CMP12_330:%.*]] = icmp slt i32 [[TMP43]], [[SPEC_SELECT8_3_2]] +; CHECK-NEXT: [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP43]], i32 [[SPEC_SELECT8_3_2]] ; CHECK-NEXT: [[SUB_1_3:%.*]] = sub i32 [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP43:%.*]] = icmp slt i32 [[SUB_1_3]], 0 +; CHECK-NEXT: [[TMP44:%.*]] = icmp slt i32 [[SUB_1_3]], 0 ; CHECK-NEXT: [[NEG_1_3:%.*]] = sub nsw i32 0, [[SUB_1_3]] -; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[NEG_1_3]], i32 [[SUB_1_3]] -; CHECK-NEXT: [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP44]], [[SPEC_SELECT8_332]] -; CHECK-NEXT: [[TMP45:%.*]] = or i1 [[CMP12_1_3]], [[CMP12_330]] -; CHECK-NEXT: [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP44]], i32 [[SPEC_SELECT8_332]] +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[NEG_1_3]], i32 [[SUB_1_3]] +; CHECK-NEXT: [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP45]], [[SPEC_SELECT8_332]] +; CHECK-NEXT: [[TMP46:%.*]] = or i1 
[[CMP12_1_3]], [[CMP12_330]] +; CHECK-NEXT: [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP45]], i32 [[SPEC_SELECT8_332]] ; CHECK-NEXT: [[SUB_2_3:%.*]] = sub i32 [[TMP15]], [[TMP11]] -; CHECK-NEXT: [[TMP46:%.*]] = icmp slt i32 [[SUB_2_3]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = icmp slt i32 [[SUB_2_3]], 0 ; CHECK-NEXT: [[NEG_2_3:%.*]] = sub nsw i32 0, [[SUB_2_3]] -; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[NEG_2_3]], i32 [[SUB_2_3]] -; CHECK-NEXT: [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP47]], [[SPEC_SELECT8_1_3]] -; CHECK-NEXT: [[TMP48:%.*]] = or i1 [[CMP12_2_3]], [[TMP45]] -; CHECK-NEXT: [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP47]], i32 [[SPEC_SELECT8_1_3]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[NEG_2_3]], i32 [[SUB_2_3]] +; CHECK-NEXT: [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP48]], [[SPEC_SELECT8_1_3]] +; CHECK-NEXT: [[TMP49:%.*]] = or i1 [[CMP12_2_3]], [[TMP46]] +; CHECK-NEXT: [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP48]], i32 [[SPEC_SELECT8_1_3]] ; CHECK-NEXT: [[SUB_3_3:%.*]] = sub i32 [[TMP15]], [[TMP12]] -; CHECK-NEXT: [[TMP49:%.*]] = icmp slt i32 [[SUB_3_3]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = icmp slt i32 [[SUB_3_3]], 0 ; CHECK-NEXT: [[NEG_3_3:%.*]] = sub nsw i32 0, [[SUB_3_3]] -; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[NEG_3_3]], i32 [[SUB_3_3]] -; CHECK-NEXT: [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP50]], [[SPEC_SELECT8_2_3]] -; CHECK-NEXT: [[TMP51:%.*]] = or i1 [[CMP12_3_3]], [[TMP48]] -; CHECK-NEXT: [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP51]], i32 3, i32 [[SPEC_SELECT_3_2]] -; CHECK-NEXT: [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP50]], i32 [[SPEC_SELECT8_2_3]] -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i32> poison, i32 [[TMP15]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[NEG_3_3]], i32 [[SUB_3_3]] +; 
CHECK-NEXT: [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP51]], [[SPEC_SELECT8_2_3]] +; CHECK-NEXT: [[TMP52:%.*]] = or i1 [[CMP12_3_3]], [[TMP49]] +; CHECK-NEXT: [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP52]], i32 3, i32 [[SPEC_SELECT_3_2]] +; CHECK-NEXT: [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP51]], i32 [[SPEC_SELECT8_2_3]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sub <16 x i32> [[SHUFFLE2]], [[TMP13]] ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <16 x i32> [[TMP53]], i32 0 ; CHECK-NEXT: [[NEG_4:%.*]] = sub nsw i32 0, [[TMP54]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -35,14 +35,14 @@ ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] ; CHECK-NEXT: [[T40:%.*]] = mul nsw i32 [[T39]], 9633 -; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 -; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 -; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 -; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 +; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 +; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 +; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 ; 
CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -35,14 +35,14 @@ ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] ; CHECK-NEXT: [[T40:%.*]] = mul nsw i32 [[T39]], 9633 -; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 -; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 -; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 -; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 +; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 +; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 +; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll @@ -17,9 +17,9 @@ ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds 
float, float* [[D:%.*]], i64 -1 ; CHECK-NEXT: [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2 ; CHECK-NEXT: [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP6]] to <4 x float> -; CHECK-NEXT: [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], poison +; CHECK-NEXT: [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], undef ; CHECK-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ADD_PTR53]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -6,34 +6,28 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = uitofp i16 undef to float ; CHECK-NEXT: [[SUB:%.*]] = fsub float 6.553500e+04, undef -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[SUB]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP18:%.*]], [[BB3:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]] ; CHECK-NEXT: 
[[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float> -; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]] +; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> poison, double [[SUB1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[ADD1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float> +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP18]] = phi <4 x float> [ [[TMP17]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll 
b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -133,26 +133,26 @@ ; MAX256-NEXT: br label [[BB1:%.*]] ; MAX256: bb1: ; MAX256-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float +; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 ; MAX256-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float +; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 ; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float +; MAX256-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 ; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 -; MAX256-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 -; MAX256-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] -; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX256-NEXT: [[SHUFFLE6:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 +; MAX256-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer +; 
MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE6]], [[SHUFFLE7]] ; MAX256-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] -; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX256-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] +; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP7:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE7]] +; MAX256-NEXT: [[TMP8:%.*]] = fadd <8 x float> zeroinitializer, [[TMP7]] +; MAX256-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[SHUFFLE2]], [[SHUFFLE7]] +; MAX256-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP9]] +; MAX256-NEXT: [[SHUFFLE4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE4]], [[SHUFFLE7]] ; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] ; MAX256-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX256-NEXT: i32 0, label [[BB2:%.*]] @@ -166,10 +166,10 @@ ; MAX256: bb5: ; MAX256-NEXT: br label [[BB2]] ; MAX256: bb2: -; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], 
[[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] -; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] -; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP8]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] +; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ] +; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] +; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[TMP6]], [[BB4]] ], [ [[TMP6]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] ; MAX256-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 ; MAX256-NEXT: store float [[TMP17]], float* undef, align 4 ; MAX256-NEXT: ret void @@ -179,26 +179,26 @@ ; MAX1024-NEXT: br label [[BB1:%.*]] ; MAX1024: bb1: ; MAX1024-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float +; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 ; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float +; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 ; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float +; MAX1024-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 ; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 -; MAX1024-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 -; MAX1024-NEXT: [[SHUFFLE12:%.*]] = 
shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] -; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] +; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX1024-NEXT: [[SHUFFLE6:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 +; MAX1024-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE6]], [[SHUFFLE7]] ; MAX1024-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] -; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX1024-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] +; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP7:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE7]] +; MAX1024-NEXT: [[TMP8:%.*]] = fadd <8 x float> zeroinitializer, [[TMP7]] +; 
MAX1024-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[SHUFFLE2]], [[SHUFFLE7]] +; MAX1024-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP9]] +; MAX1024-NEXT: [[SHUFFLE4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE4]], [[SHUFFLE7]] ; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] ; MAX1024-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX1024-NEXT: i32 0, label [[BB2:%.*]] @@ -212,10 +212,10 @@ ; MAX1024: bb5: ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb2: -; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] -; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP8]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] +; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ] +; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[TMP6]], [[BB4]] ], [ [[TMP6]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] ; MAX1024-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 ; MAX1024-NEXT: 
store float [[TMP17]], float* undef, align 4 ; MAX1024-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll @@ -43,8 +43,8 @@ define void @test2() { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> ) -; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> poison, [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77) ; CHECK-NEXT: [[E:%.*]] = icmp ugt i32 [[TMP3]], 1