diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -323,26 +323,6 @@ /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -912,6 +892,8 @@ VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); + GatheredLoads.clear(); + GatheredLoadsEntriesFirst = -1; ExternalUses.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); @@ -941,7 +923,9 @@ /// reordered and return the most optimal order. /// \param TopToBottom If true, include the order of vectorized stores and /// insertelement nodes, otherwise skip them. - Optional getReorderingData(const TreeEntry &TE, bool TopToBottom); + Optional getReorderingData( + const TreeEntry &TE, bool TopToBottom, + DenseMap &ScatterVectorizeToReorder); /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes @@ -2158,12 +2142,36 @@ const APInt &ShuffledIndices, bool NeedToShuffle) const; + /// Check if two tree entries are part of the single path. + bool isPathPart(const TreeEntry *E, const TreeEntry *TE, + DenseMap> + &PathInfo) { + auto &Item = PathInfo.FindAndConstruct(E); + if (Item.second.contains(TE)) + return true; + SmallVector Stack(E->UserTreeIndices.begin(), + E->UserTreeIndices.end()); + SmallPtrSet Visited; + while (!Stack.empty()) { + const EdgeInfo EI = Stack.pop_back_val(); + Item.getSecond().insert(EI.UserTE); + if (EI.UserTE == TE) + return true; + if (!Visited.insert(EI.UserTE).second) + continue; + Stack.append(EI.UserTE->UserTreeIndices.begin(), + EI.UserTE->UserTreeIndices.end()); + } + return false; + } + /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Mask is filled with the shuffle mask. Optional - isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, + isGatherShuffledEntry(const TreeEntry *TE, ArrayRef Scalars, + SmallVectorImpl &Mask, SmallVectorImpl &Entries); /// \returns the scalarization cost for this list of values. Assuming that @@ -2182,6 +2190,19 @@ /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(bool ForReduction) const; + /// Run through the list of all gathered loads in the graph and try to find + /// vector loads/masked gathers instead of regular gathers. Later these loads + /// are reshufled to build final gathered nodes. + void tryToVectorizeGatheredLoads(); + + /// Checks if gather node \p E can be represented as a shuffle of vectorized + /// scalars and perform \p Callback transformation/analysis. + /// \returns The list of gathered loads and poison that should be gathered + /// into final vector. + SmallVector + tryToMatchVector(const TreeEntry *E, + function_ref)> Callback); + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. static void reorderInputsAccordingToOpcode(ArrayRef VL, @@ -2611,6 +2632,13 @@ /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; + /// A list of loads to be gathered during the vectorization process. We can + /// try to vectorize them at the end, if profitable. + SmallVector>> GatheredLoads; + + /// The index of the first gathered load entry in the VectorizeTree. + int GatheredLoadsEntriesFirst = -1; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -3423,13 +3451,15 @@ if (!isa(V)) continue; if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) + if (!STE) { STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. + if (STE->getVectorFactor() != NumScalars) + return None; + } else if (STE != LocalSTE) { + // Take the order only from the single vector node of the same size. return None; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); + } + unsigned Lane = STE->findLaneForValue(V); if (Lane >= NumScalars) return None; if (CurrentOrder[Lane] != NumScalars) { @@ -3656,8 +3686,12 @@ return None; } -Optional BoUpSLP::getReorderingData(const TreeEntry &TE, - bool TopToBottom) { +Optional BoUpSLP::getReorderingData( + const TreeEntry &TE, bool TopToBottom, + DenseMap &ScatterVectorizeToReorder) { + if (GatheredLoadsEntriesFirst > 0 && TE.UserTreeIndices.empty() && + &TE != VectorizableTree.front().get()) + return None; // No need to reorder if need to shuffle reuses, still need to shuffle the // node. if (!TE.ReuseShuffleIndices.empty()) @@ -3667,23 +3701,22 @@ (TopToBottom && isa(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; - if (TE.State == TreeEntry::NeedToGather) { + bool LoadsScatterVectorize = false; + if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() && + allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. - if (((TE.getOpcode() == Instruction::ExtractElement && - !TE.isAltShuffle()) || + if ((TE.getOpcode() == Instruction::ExtractElement || (all_of(TE.Scalars, [](Value *V) { return isa(V); }) && any_of(TE.Scalars, [](Value *V) { return isa(V); }))) && - all_of(TE.Scalars, - [](Value *V) { - auto *EE = dyn_cast(V); - return !EE || isa(EE->getVectorOperandType()); - }) && - allSameType(TE.Scalars)) { + all_of(TE.Scalars, [](Value *V) { + auto *EE = dyn_cast(V); + return !EE || isa(EE->getVectorOperandType()); + })) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. OrdersType CurrentOrder; @@ -3693,9 +3726,28 @@ fixupOrderingIndices(CurrentOrder); return CurrentOrder; } - } - if (Optional CurrentOrder = findReusedOrderedScalars(TE)) + } else if (TE.getOpcode() == Instruction::Load) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState Res = + canVectorizeLoads(TE.Scalars, TE.Scalars.front(), *TTI, *DL, *SE, *LI, + CurrentOrder, PointerOps); + if (Res == LoadsState::Vectorize) + return CurrentOrder; + LoadsScatterVectorize = Res == LoadsState::ScatterVectorize; + } + if (Optional CurrentOrder = findReusedOrderedScalars(TE)) { + if (LoadsScatterVectorize) { + TreeEntry *ScatterVectorTE = getTreeEntry(TE.Scalars.front()); + assert(ScatterVectorTE && "No related ScatterVector node found."); + if (ScatterVectorTE->Idx >= GatheredLoadsEntriesFirst && + ScatterVectorTE->UserTreeIndices.empty()) { + ScatterVectorizeToReorder.try_emplace(&TE, ScatterVectorTE); + return None; + } + } return CurrentOrder; + } if (TE.Scalars.size() >= 4) if (Optional Order = findPartiallyOrderedLoads(TE)) return Order; @@ -3717,6 +3769,9 @@ // Maps a TreeEntry to the reorder indices of external users. DenseMap> ExternalUserReorderMap; + // Nodes with loads masked gathering built out of gathered loads that should + // be reordered to avoid extra shuffles. + DenseMap ScatterVectorizeToReorder; // FIXME: Workaround for syntax error reported by MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; // Find all reorderable nodes with the given VF. @@ -3724,7 +3779,7 @@ // extracts. for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries, &GathersToOrders, &ExternalUserReorderMap, - &AltShufflesToOrders]( + &AltShufflesToOrders, &ScatterVectorizeToReorder]( const std::unique_ptr &TE) { // Look for external users that will probably be vectorized. SmallVector ExternalUserReorderIndices = @@ -3756,8 +3811,8 @@ // TODO: Check the reverse order too. } - if (Optional CurrentOrder = - getReorderingData(*TE, /*TopToBottom=*/true)) { + if (Optional CurrentOrder = getReorderingData( + *TE, /*TopToBottom=*/true, ScatterVectorizeToReorder)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. If follow the order // here, it causes reordering of the whole graph though actually it is @@ -3870,6 +3925,10 @@ }); // Do an actual reordering, if profitable. for (std::unique_ptr &TE : VectorizableTree) { + // Do not reorder gathered loads. + if (GatheredLoadsEntriesFirst > 0 && TE.get()->UserTreeIndices.empty() && + TE.get() != VectorizableTree.front().get()) + continue; // Just do the reordering for the nodes with the given VF. if (TE->Scalars.size() != VF) { if (TE->ReuseShuffleIndices.size() == VF) { @@ -3977,13 +4036,16 @@ // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. SmallVector NonVectorized; + // Nodes with loads masked gathering built out of gathered loads that should + // be reordered to avoid extra shuffles. + DenseMap ScatterVectorizeToReorder; for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, - &NonVectorized]( + &NonVectorized, &ScatterVectorizeToReorder]( const std::unique_ptr &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - if (Optional CurrentOrder = - getReorderingData(*TE, /*TopToBottom=*/false)) { + if (Optional CurrentOrder = getReorderingData( + *TE, /*TopToBottom=*/false, ScatterVectorizeToReorder)) { OrderedEntries.insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); @@ -4017,6 +4079,8 @@ // search. The graph currently does not provide this dependency directly. for (EdgeInfo &EI : TE->UserTreeIndices) { TreeEntry *UserTE = EI.UserTE; + if (!UserTE) + continue; auto It = Users.find(UserTE); if (It == Users.end()) It = Users.insert({UserTE, {}}).first; @@ -4219,6 +4283,75 @@ if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && VectorizableTree.front()->ReuseShuffleIndices.empty()) VectorizableTree.front()->ReorderIndices.clear(); + // Reorder masked gather nodes built out of gathered loads. + SmallPtrSet Processed; + for (const auto &SVLoadsData : ScatterVectorizeToReorder) { + if (!Processed.insert(SVLoadsData.second).second) + continue; + Optional CurrentOrder = + findReusedOrderedScalars(*SVLoadsData.first); + assert(CurrentOrder && "Expected order."); + if (CurrentOrder->empty() || !SVLoadsData.second->UserTreeIndices.empty()) + continue; + SmallVector Operands; + SmallVector Worklist(1, SVLoadsData.second); + while (!Worklist.empty()) { + TreeEntry *CurrentEntry = Worklist.pop_back_val(); + for (int I = 0, E = CurrentEntry->getNumOperands(); I < E; ++I) { + TreeEntry *Op = getTreeEntry(CurrentEntry->getOperand(I).front()); + if (!Op || !Op->isSame(CurrentEntry->getOperand(I))) { + auto *It = + find_if(VectorizableTree, + [CurrentEntry, I](const std::unique_ptr &TE) { + return TE->State != TreeEntry::Vectorize && + TE->isSame(CurrentEntry->getOperand(I)); + }); + assert(It != VectorizableTree.end() && + "No entry for pointers of ScatterVectorize node."); + Op = It->get(); + } + if (Op->ReuseShuffleIndices.empty()) + Worklist.push_back(Op); + Operands.push_back(Op); + } + } + if (any_of(Operands, [](const TreeEntry *TE) { + return TE->UserTreeIndices.size() != 1 && + (!isSplat(TE->Scalars) || + any_of(TE->Scalars, UndefValue::classof)); + })) + continue; + // Reorder related masked gather node and its operands. + SmallVector Mask(CurrentOrder->size(), UndefMaskElem); + unsigned E = CurrentOrder->size(); + transform(*CurrentOrder, Mask.begin(), [E](unsigned I) { + return I < E ? static_cast(I) : UndefMaskElem; + }); + for (TreeEntry *OpTE : Operands) { + // If there are several users of the pointers tree entry, no need to + // reorder the scatter vectorize node, still have same number of shuffles. + if (!OpTE->ReuseShuffleIndices.empty()) { + reorderReuses(OpTE->ReuseShuffleIndices, Mask); + } else if (OpTE->State == TreeEntry::NeedToGather) { + assert(OpTE->ReorderIndices.empty() && + "Expected no ordering for pointers node."); + reorderScalars(OpTE->Scalars, Mask); + } else { + assert(OpTE->State == TreeEntry::Vectorize && + "Expected Vectorize node"); + assert(OpTE->getNumOperands() == 2 && + "Expected 2 operands for pointers node."); + assert(OpTE->ReorderIndices.empty() && + "Expected no ordering for pointers node."); + OpTE->reorderOperands(Mask); + reorderScalars(OpTE->Scalars, Mask); + } + } + SVLoadsData.second->reorderOperands(Mask); + assert(SVLoadsData.second->ReorderIndices.empty() && + "Expected empty reorder sequence."); + reorderScalars(SVLoadsData.second->Scalars, Mask); + } } void BoUpSLP::buildExternalUses( @@ -4433,6 +4566,14 @@ if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); + // Try to vectorize gathered loads if this is not just a gather of loads. + if (!GatheredLoads.empty() && + !(VectorizableTree.size() == 2 && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.back()->State == TreeEntry::NeedToGather && + VectorizableTree.back()->getOpcode() != Instruction::Load)) + tryToVectorizeGatheredLoads(); } void BoUpSLP::buildTree(ArrayRef Roots) { @@ -4440,6 +4581,179 @@ if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); + // Try to vectorize gathered loads if this is not just a gather of loads. + if (!GatheredLoads.empty() && + !(VectorizableTree.size() == 2 && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.back()->State == TreeEntry::NeedToGather && + VectorizableTree.back()->getOpcode() != Instruction::Load)) + tryToVectorizeGatheredLoads(); +} + +/// Tries to find subvector of loads and builds new vector of only loads if can +/// be profitable. +static void gatherPossiblyVectorizableLoads( + const BoUpSLP &R, ArrayRef VL, const DataLayout &DL, + ScalarEvolution &SE, + SmallVectorImpl>> &GatheredLoads, + bool AddNew = true) { + for (Value *V : VL) { + auto *LI = dyn_cast(V); + if (!LI) + continue; + if (R.isDeleted(LI) || !isValidElementType(LI->getType())) + continue; + bool IsFound = false; + SmallVectorImpl> *LastCompatibleData = nullptr; + for (auto &Data : GatheredLoads) { + if (LI->getParent() != Data.front().first->getParent()) + continue; + Optional Dist = getPointersDiff( + LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), + Data.front().first->getPointerOperand(), DL, SE, + /*StrictCheck=*/true); + bool NeedToAdd = true; + if (Dist && all_of(Data, [&Dist, LI, &NeedToAdd, AddNew]( + const std::pair &Pair) { + bool WasAdded = AddNew && Pair.first == LI; + NeedToAdd &= !WasAdded; + return Pair.second != *Dist || WasAdded; + })) { + if (NeedToAdd) + Data.emplace_back(LI, *Dist); + IsFound = true; + break; + } + // Find last compatible list loads, they may form masked gather sequence. + if (!AddNew && !Dist && LI->getType() == Data.front().first->getType()) + LastCompatibleData = &Data; + } + if (!IsFound) { + if (AddNew) { + GatheredLoads.emplace_back().emplace_back(LI, 0); + } else if (LastCompatibleData && + none_of(*LastCompatibleData, + [LI](const std::pair &Pair) { + return Pair.first == LI; + })) { + // As a final attempt, just gather all loads in the very last group. + LastCompatibleData->emplace_back(LI, LastCompatibleData->size()); + } + } + } +} + +void BoUpSLP::tryToVectorizeGatheredLoads() { + GatheredLoadsEntriesFirst = VectorizableTree.size(); + + // Sort loads by distance. + auto &&LoadSorter = [](const std::pair &L1, + const std::pair &L2) { + return L1.second > L2.second; + }; + + for (ArrayRef> LoadsDists : GatheredLoads) { + SmallVector> LocalLoadsDists(LoadsDists); + sort(LocalLoadsDists, LoadSorter); + SmallVector Loads(LocalLoadsDists.size()); + transform(LocalLoadsDists, Loads.begin(), + [](const std::pair &L) { return L.first; }); + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + for (int NumElts = PowerOf2Floor(Loads.size()); NumElts > 1; NumElts /= 2) { + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; + ++Cnt) { + ArrayRef Slice = makeArrayRef(Loads).slice(Cnt, NumElts); + if (VectorizedLoads.count(Slice.front()) || + VectorizedLoads.count(Slice.back())) + continue; + // Check if it is profitable to try vectorizing gathered loads. It is + // profitable if we have more than 4 consecutive loads or if we have + // less but all users are vectorized or deleted. + bool AllowToVectorize = + NumElts >= 4 || + any_of(VectorizableTree, + [Slice](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && + TE->Scalars.size() == 2 && + (equal(TE->Scalars, Slice) || + equal(TE->Scalars, reverse(Slice))); + }); + // Check if it is profitable to vectorize 2-elements loads. + if (NumElts == 2) { + bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( + Slice.front()->getType(), ElementCount::getFixed(NumElts)); + auto &&CheckIfAllowed = [this, IsLegalBroadcastLoad]( + ArrayRef Slice) { + for (LoadInst *LI : Slice) { + // If single use/user - allow to vectorize. + if (LI->hasOneUse()) + continue; + // 1. Check if number of uses equal number of users. + // 2. All users are deleted. + // 3. The load broadcasts are not allowed or the load is not + // broadcasted. + if (std::distance(LI->user_begin(), LI->user_end()) != + LI->getNumUses()) + return false; + for (User *U : LI->users()) { + if (auto *UI = dyn_cast(U); UI && isDeleted(UI)) + continue; + if (const TreeEntry *UTE = getTreeEntry(U)) { + if (!IsLegalBroadcastLoad) + // The broadcast is illegal - vectorize loads. + continue; + for (int I = 0, End = UTE->getNumOperands(); I < End; ++I) { + if (all_of(UTE->getOperand(I), + [LI](Value *V) { return V == LI; })) + // Found legal broadcast - do not vectorize. + return false; + } + } + continue; + } + } + return true; + }; + AllowToVectorize = CheckIfAllowed(Slice); + } + if (AllowToVectorize) { + SmallVector PointerOps; + OrdersType CurrentOrder; + // Try to build vector load. + ArrayRef Values( + reinterpret_cast(Slice.begin()), Slice.size()); + if (canVectorizeLoads(Values, Slice.front(), *TTI, *DL, *SE, *LI, + CurrentOrder, + PointerOps) != LoadsState::Gather) { + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" + << NumElts << ")\n"); + + buildTree_rec(Values, 0, EdgeInfo()); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += NumElts; + Cnt += NumElts - 1; + continue; + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Loads.size()) + break; + } + } + for (LoadInst *LI : Loads) { + if (!VectorizedLoads.contains(LI)) { + // Reinsert non-vectorized loads to other list of loads with the same + // base pointers. + gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, GatheredLoads, + /*AddNew=*/false); + } + } + } } /// \return true if the specified list of values has only one instruction that @@ -4582,6 +4896,8 @@ !isConstant(V); })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return false; @@ -4592,6 +4908,17 @@ }; InstructionsState S = getSameOpcode(VL); + // Don't vectorize ephemeral values. + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } + } + } // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without @@ -4606,6 +4933,8 @@ cast(S.MainOp)->getOpcode(); })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -4694,6 +5023,7 @@ } return true; }; + bool IsSplat = isSplat(VL); SmallVector SortedIndices; BasicBlock *BB = nullptr; bool AreAllSameInsts = @@ -4713,13 +5043,16 @@ BB && sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, SortedIndices)); - if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts || + if (allConstant(VL) || IsSplat || !AreAllSameInsts || + (S.getOpcode() == Instruction::Load && UserTreeIdx.UserTE) || (isa( S.OpValue) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); - if (TryToFindDuplicates(S)) + if (IsSplat || TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); return; @@ -4727,19 +5060,6 @@ // We now know that this is a vector of instructions of the same type from // the same block. - - // Don't vectorize ephemeral values. - if (!EphValues.empty()) { - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } - } - } - // Check if this is a duplicate of another entry. if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); @@ -5043,6 +5363,8 @@ else LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); #endif // NDEBUG + if (UserTreeIdx.UserTE) + gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, GatheredLoads); break; } return; @@ -5809,6 +6131,60 @@ return I->getOpcode() == AltOp->getOpcode(); } +SmallVector BoUpSLP::tryToMatchVector( + const TreeEntry *E, + function_ref)> Callback) { + ArrayRef VL = E->Scalars; + // Check if these gather loads are part of the vectorized wide loads. + MapVector> Masks; + SmallVector GatheredVectors(VL.size(), + PoisonValue::get(VL.front()->getType())); + DenseMap> PathInfo; + for (auto [I, V] : enumerate(VL)) { + if (TreeEntry *TE = getTreeEntry(V)) { + if (!isPathPart(E, TE, PathInfo)) { + auto It = Masks.find(TE); + if (It == Masks.end()) { + It = Masks + .insert(std::make_pair( + TE, SmallVector(VL.size(), UndefMaskElem))) + .first; + } + It->second[I] = TE->findLaneForValue(V); + continue; + } + } + GatheredVectors[I] = V; + } + // Perform callback transformation/analysis on the built masks/entries. + if (!Masks.empty()) { + if (Masks.size() == 1) { + Callback(Masks.begin()->first, Masks.begin()->second); + } else { + Callback(Masks.begin()->first, None); + SmallVector &Mask = Masks.begin()->second; + bool AdjustMask = false; + unsigned VF = Masks.begin()->first->getVectorFactor(); + for (const auto &Data : drop_begin(Masks)) { + VF = std::max(VF, Data.first->getVectorFactor()); + for (int I = 0, End = Data.second.size(); I < End; ++I) { + int Idx = Data.second[I]; + if (Idx != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Mask is used already."); + Mask[I] = VF + Idx; + } else if (AdjustMask && Mask[I] != UndefMaskElem) { + Mask[I] = I; + } + } + Callback(Data.first, Mask); + AdjustMask = true; + VF = Data.second.size(); + } + } + } + return GatheredVectors; +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -5917,160 +6293,267 @@ return 0; if (isa(VL[0])) return InstructionCost::getInvalid(); + InstructionCost ReuseShuffleCost = 0; + if (NeedToShuffleReuses) + ReuseShuffleCost = TTI->getShuffleCost( + TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); + InstructionCost GatherCost = 0; + // Check if these gathers are part of the vectorized nodes. + bool MultipleShuffles = false; + unsigned FirstVF = 0; + SmallVector Gathers = tryToMatchVector(E, [this, VecTy, + &MultipleShuffles, + &GatherCost, &FirstVF]( + TreeEntry *TE, + ArrayRef Mask) { + if (Mask.empty()) { + // This is the first element of multiple (>2) shuffles of loads. + MultipleShuffles = true; + FirstVF = TE->getVectorFactor(); + } else if (!MultipleShuffles) { + unsigned VF = TE->getVectorFactor(); + int Limit = VF * 2; + if (VF != Mask.size() || + (all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(Mask))) { + int Index; + int NumElts; + auto *OriginalVecTy = + FixedVectorType::get(VecTy->getElementType(), VF); + if ((VF > Mask.size() && + ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && + Index % Mask.size() != 0) || + (VF < Mask.size() && + ShuffleVectorInst::isInsertSubvectorMask(Mask, VF, NumElts, + Index) && + Index % NumElts != 0)) { + GatherCost += TTI->getShuffleCost( + VF > Mask.size() ? TTI::SK_ExtractSubvector + : TTI::SK_InsertSubvector, + VF > Mask.size() ? OriginalVecTy : VecTy, None, Index, + VF > Mask.size() ? VecTy : OriginalVecTy); + } + } + } else { + FixedVectorType *VTy = nullptr; + if (FirstVF > 0) { + if (FirstVF != TE->getVectorFactor()) { + VTy = + FixedVectorType::get(VecTy->getElementType(), + std::max(FirstVF, TE->getVectorFactor())); + } else { + VTy = FixedVectorType::get(VecTy->getElementType(), FirstVF); + } + FirstVF = 0; + } else if (TE->getVectorFactor() != VecTy->getNumElements()) { + // Resize next vector to VecTy. + VTy = FixedVectorType::get( + VecTy->getElementType(), + std::max(TE->getVectorFactor(), VecTy->getNumElements())); + } else { + VTy = VecTy; + } + if (VTy->getNumElements() <= VecTy->getNumElements()) { + unsigned Idx = 0; + InstructionCost InsertCost = 0; + // Check if we can replace just by insert subvector sequences. + auto &&CheckForInsertSubMask = + [VTy, &InsertCost, VecTy, this](int Idx, ArrayRef SubMask) { + unsigned Bound = Idx; + for (auto [Pos, I] : enumerate(SubMask)) { + if (I == UndefMaskElem) + continue; + if (I - Bound != Pos) + return false; + } + if (Idx != 0) + InsertCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, + VecTy, None, Idx, VTy); + return true; + }; + // Check if we can replace just by extract/insert subvector + // sequences. + auto &&CheckForPermuteSubMask = [this, VTy, &InsertCost, Mask]( + int Idx, ArrayRef SubMask) { + unsigned LocalVF = VTy->getNumElements(); + if (LocalVF <= 1) + return false; + unsigned SubVF = LocalVF / 2; + auto *SubTy = FixedVectorType::get(VTy->getElementType(), SubVF); + unsigned FirstPos = 0; + for (auto [Pos, I] : enumerate(SubMask)) { + if (Pos == SubVF) { + if (SubMask[FirstPos] > 0) { + int ExtractIdx = (SubMask[FirstPos] % Mask.size()) % LocalVF; + if (ExtractIdx + SubVF > LocalVF) + return false; + InsertCost += + TTI->getShuffleCost(TTI::SK_ExtractSubvector, VTy, None, + ExtractIdx, SubTy); + } + FirstPos = Pos; + } + if (I == UndefMaskElem) + continue; + if (static_cast(I) != + SubMask[FirstPos] + Pos - FirstPos) + return false; + } + if (SubMask[FirstPos] > 0) { + int ExtractIdx = (SubMask[FirstPos] % Mask.size()) % LocalVF; + if (ExtractIdx + SubVF > LocalVF) + return false; + InsertCost += TTI->getShuffleCost(TTI::SK_ExtractSubvector, VTy, + None, ExtractIdx, SubTy); + } + InsertCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VTy, + None, FirstPos, SubTy); + return true; + }; + + while (Idx < Mask.size()) { + SmallVector SubMask(Mask.slice(Idx, VTy->getNumElements())); + if (CheckForInsertSubMask(Idx, SubMask) || + all_of(SubMask, [](int Idx) { return Idx == UndefMaskElem; }) || + CheckForPermuteSubMask(Idx, SubMask)) { + Idx += VTy->getNumElements(); + continue; + } + break; + } + if (Idx == Mask.size()) { + GatherCost += InsertCost; + } else { + GatherCost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VTy, Mask); + } + } else { + GatherCost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VTy, Mask); + } + } + }); + if (all_of(Gathers, PoisonValue::classof)) + return ReuseShuffleCost + GatherCost; + // Check if need to shuffle 2 vectors - gathered subvector and the shuffled + // vector. + bool HasShuffledGathers = Gathers != VL; SmallVector Mask; SmallVector Entries; Optional Shuffle = - isGatherShuffledEntry(E, Mask, Entries); + isGatherShuffledEntry(E, Gathers, Mask, Entries); if (Shuffle) { - InstructionCost GatherCost = 0; - if (ShuffleVectorInst::isIdentityMask(Mask)) { + // Need to shuffle 2 vectors - gathered subvector and the shuffled vector. + if (HasShuffledGathers) { + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Gathers[I] != VL[I]) { + assert(Mask[I] == UndefMaskElem && + "Expected undefined mask element."); + Mask[I] = Sz + I; + } + } + GatherCost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask); + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) + if (Mask[I] != UndefMaskElem) + Mask[I] = I; + } + if (!HasShuffledGathers && ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( dbgs() << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); + << *Gathers.front() << ".\n"); if (NeedToShuffleReuses) - GatherCost = + GatherCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); } else { LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle that starts with " - << *VL.front() << ".\n"); + << *Gathers.front() << ".\n"); // Detected that instead of gather we can emit a shuffle of single/two // previously vectorized nodes. Add the cost of the permutation rather // than gather. ::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); + GatherCost += TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); + } + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + Gathers[I] = PoisonValue::get(ScalarTy); } - return GatherCost; } + if (all_of(Gathers, PoisonValue::classof)) + return (HasShuffledGathers ? 0 : ReuseShuffleCost) + GatherCost; if ((E->getOpcode() == Instruction::ExtractElement || - all_of(E->Scalars, + all_of(Gathers, [](Value *V) { return isa(V); })) && - allSameType(VL)) { + allSameType(Gathers)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector Mask; Optional ShuffleKind = - isFixedVectorShuffle(VL, Mask); + isFixedVectorShuffle(Gathers, Mask); if (ShuffleKind) { + // Need to shuffle 2 vectors - gathered subvector and the shuffled + // vector. + if (HasShuffledGathers) { + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Gathers[I] != VL[I]) { + assert(Mask[I] == UndefMaskElem && + "Expected undefined mask element."); + Mask[I] = Sz + I; + } + } + GatherCost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask); + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) + if (Mask[I] != UndefMaskElem) + Mask[I] = I; + } // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. InstructionCost Cost = - computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); + computeExtractCost(Gathers, VecTy, *ShuffleKind, Mask, *TTI); AdjustExtractsCost(Cost); - if (NeedToShuffleReuses) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - return Cost; - } - } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - assert(VecTy == FinalVecTy && - "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/None, /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/VL[0]); - } - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) - ReuseShuffleCost = TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. - if (VL.size() > 2 && E->getOpcode() == Instruction::Load && - !E->isAltShuffle()) { - BoUpSLP::ValueSet VectorizedLoads; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); - for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, - CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; - else - ++ScatterVectorizeCnt; - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; - } - if (!VectorizedLoads.empty()) { - InstructionCost GatherCost = 0; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += getGatherCost(VL.slice(I, VF)); - } - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += TTI->getMemoryOpCost( - Instruction::Load, LI->getType(), LI->getAlign(), - LI->getPointerAddressSpace(), CostKind, LI); - } - auto *LI = cast(E->getMainOp()); - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, LI); - GatherCost += ScatterVectorizeCnt * - TTI->getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - for (int I = VF, E = VL.size(); I < E; I += VF) - GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - None, I, LoadTy); - } - return ReuseShuffleCost + GatherCost - ScalarsCost; - } - } - return ReuseShuffleCost + getGatherCost(VL); + GatherCost += Cost; + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + Gathers[I] = PoisonValue::get(ScalarTy); + } + } + } + if (all_of(Gathers, PoisonValue::classof)) + return GatherCost + ReuseShuffleCost; + bool AllConstant = allConstant(Gathers); + bool IsSplat = !AllConstant && isSplat(Gathers) && + count_if(Gathers, [](Value *V) { + return isa(V); + }) < static_cast(Gathers.size() - 1); + if (Gathers != VL) { + GatherCost += ReuseShuffleCost; + if (AllConstant || IsSplat) { + Mask.assign(Gathers.size(), UndefMaskElem); + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) + Mask[I] = I + (Gathers[I] != VL[I] ? Sz : 0); + // The cost is gather cost + broadcast (if splat) + shuffle with the + // gather + (possible) extension for reused scalars. + GatherCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + VecTy, Mask); + } + } + // For single remaining element just gather it. + if (AllConstant || IsSplat) { + // The cost is gather cost + broadcast (if splat) + shuffle with the + // gather + (possible) extension for reused scalars. + return GatherCost + (AllConstant + ? 0 + : TTI->getShuffleCost( + TargetTransformInfo::SK_Broadcast, VecTy)); + } + return GatherCost + getGatherCost(Gathers); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -6281,10 +6764,6 @@ InstructionCost ScalarEltCost = TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } - // Calculate the cost of this instruction. InstructionCost ScalarCost = VL.size() * ScalarEltCost; @@ -6306,9 +6785,6 @@ InstructionCost ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; @@ -6410,9 +6886,6 @@ InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, @@ -6435,9 +6908,6 @@ InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost( Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); @@ -6449,9 +6919,6 @@ Align Alignment = cast(VL0)->getAlign(); InstructionCost ScalarEltCost = TTI->getMemoryOpCost( Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { @@ -6492,9 +6959,6 @@ IntrinsicCostAttributes CostAttrs(ID, *CI, 1); InstructionCost ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - if (NeedToShuffleReuses) { - CommonCost -= (EntryVF - VL.size()) * ScalarEltCost; - } InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); @@ -6516,16 +6980,6 @@ (isa(VL0) && isa(E->getAltOp()))) && "Invalid Shuffle Vector Operand"); InstructionCost ScalarCost = 0; - if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); - CommonCost -= TTI->getInstructionCost(I, CostKind); - } - for (Value *V : VL) { - Instruction *I = cast(V); - CommonCost += TTI->getInstructionCost(I, CostKind); - } - } for (Value *V : VL) { Instruction *I = cast(V); assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); @@ -6633,7 +7087,13 @@ VectorizableTree[0]->getVectorFactor() > 2))) return true; - if (VectorizableTree.size() != 2) + if (VectorizableTree.size() != 2 && GatheredLoadsEntriesFirst == -1) + return false; + if (VectorizableTree.size() != 3 && GatheredLoadsEntriesFirst >= 2 && + all_of(makeArrayRef(VectorizableTree).drop_front(), + [](const std::unique_ptr &TE) { + return TE->Vectorize == TreeEntry::NeedToGather; + })) return false; // Handle splat and all-constants stores. Also try to vectorize tiny trees @@ -6719,8 +7179,18 @@ } bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { + bool AreAllLoadsGathers = [this]() { + return GatheredLoadsEntriesFirst >= 0 && + all_of(make_range(std::next(VectorizableTree.begin(), + GatheredLoadsEntriesFirst), + VectorizableTree.end()), + [](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather; + }); + }(); // No need to vectorize inserts of gathered values. - if (VectorizableTree.size() == 2 && + if (((VectorizableTree.size() == 2 && GatheredLoadsEntriesFirst == -1) || + (GatheredLoadsEntriesFirst == 2 && AreAllLoadsGathers)) && isa(VectorizableTree[0]->Scalars[0]) && VectorizableTree[1]->State == TreeEntry::NeedToGather && (VectorizableTree[1]->getVectorFactor() <= 2 || @@ -6730,7 +7200,10 @@ // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. - if (VectorizableTree.size() >= MinTreeSize) + if (VectorizableTree.size() >= MinTreeSize && + (GatheredLoadsEntriesFirst == -1 || + (GatheredLoadsEntriesFirst >= static_cast(MinTreeSize) && + !AreAllLoadsGathers))) return false; // If we have a tiny tree (a tree whose size is less than MinTreeSize), we @@ -6742,6 +7215,19 @@ ? ExternalUses.empty() : true && "We shouldn't have any external users"); + // Check if any of the gather node forms a buildvector somewhere. + if (any_of(VectorizableTree, [](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && + all_of(TE->Scalars, [](Value *V) { + return isa(V) || + (!V->hasNUsesOrMore(8) && + any_of(V->users(), [](User *U) { + return isa(U); + })); + }); + })) + return false; + // Otherwise, we can't vectorize the tree. It is both tiny and not fully // vectorizable. return true; @@ -7040,6 +7526,21 @@ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; + // Exclude cost of gather loads nodes which are not used. These nodes were + // built as part of the final attempt to vectorize gathered loads. + if (GatheredLoadsEntriesFirst >= 0 && + I >= static_cast(GatheredLoadsEntriesFirst) && + TE.State == TreeEntry::NeedToGather) { + assert(all_of(TE.Scalars, + [this](Value *V) { + return (isa(V) && MustGather.contains(V)) || + isa(V) || + V->getType()->isPtrOrPtrVectorTy(); + }) && + "Expected loads, pointers or constants only."); + continue; + } + InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C @@ -7242,11 +7743,12 @@ } Optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, +BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef Scalars, + SmallVectorImpl &Mask, SmallVectorImpl &Entries) { // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - Mask.assign(TE->Scalars.size(), UndefMaskElem); + Mask.assign(Scalars.size(), UndefMaskElem); Entries.clear(); // Build a lists of values to tree entries. DenseMap> ValueToTEs; @@ -7265,9 +7767,10 @@ // tree node for each gathered value - we have just a permutation of the // single vector. If we have 2 different sets, we're in situation where we // have a permutation of 2 input vectors. + DenseMap> PathInfo; SmallVector> UsedTEs; DenseMap UsedValuesEntry; - for (Value *V : TE->Scalars) { + for (Value *V : Scalars) { if (isa(V)) continue; // Build a list of tree entries where V is used. @@ -7276,7 +7779,8 @@ if (It != ValueToTEs.end()) VToTEs = It->second; if (const TreeEntry *VTE = getTreeEntry(V)) - VToTEs.insert(VTE); + if (!isPathPart(TE, VTE, PathInfo)) + VToTEs.insert(VTE); if (VToTEs.empty()) return None; if (UsedTEs.empty()) { @@ -7316,7 +7820,7 @@ } if (UsedTEs.empty()) { - assert(all_of(TE->Scalars, UndefValue::classof) && + assert(all_of(Scalars, UndefValue::classof) && "Expected vector of undefs only."); return None; } @@ -7324,8 +7828,8 @@ unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. - auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { - return EntryPtr->isSame(TE->Scalars); + auto It = find_if(UsedTEs.front(), [Scalars](const TreeEntry *EntryPtr) { + return EntryPtr->isSame(Scalars); }); if (It != UsedTEs.front().end()) { Entries.push_back(*It); @@ -7356,8 +7860,8 @@ } // Build a shuffle mask for better cost estimation and vector emission. - for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { - Value *V = TE->Scalars[I]; + int Limit = Scalars.size() * 2; + for (auto [I, V] : enumerate(Scalars)) { if (isa(V)) continue; unsigned Idx = UsedValuesEntry.lookup(V); @@ -7366,7 +7870,7 @@ Mask[I] = Idx * VF + FoundLane; // Extra check required by isSingleSourceMaskImpl function (called by // ShuffleVectorInst::isSingleSourceMask). - if (Mask[I] >= 2 * E) + if (Mask[I] >= Limit) return None; } switch (Entries.size()) { @@ -7754,16 +8258,18 @@ } } - // Can't vectorize this, so simply build a new vector with each lane - // corresponding to the requested value. - return createBuildVector(VL); + auto *I = + find_if(VectorizableTree, [VL](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && + (TE->ReuseShuffleIndices.empty() || + TE->ReuseShuffleIndices.size() == VL.size()) && + TE->isSame(VL); + }); + assert(I != VectorizableTree.end() && "Gather node is not in the graph."); + return vectorizeTree(I->get()); } + Value *BoUpSLP::createBuildVector(ArrayRef VL) { - assert(any_of(VectorizableTree, - [VL](const std::unique_ptr &TE) { - return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); - }) && - "Non-matching gather node."); unsigned VF = VL.size(); // Exploit possible reuse of values across lanes. SmallVector ReuseShuffleIndicies; @@ -7829,32 +8335,399 @@ return E->VectorizedValue; } - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, CSEBlocks); if (E->State == TreeEntry::NeedToGather) { - if (E->getMainOp()) + if (allConstant(E->Scalars)) { + Value *Vec = createBuildVector(E->Scalars); + E->VectorizedValue = Vec; + return Vec; + } + // Checks if the mask is an identity mask. + auto &&IsIdentityMask = [](ArrayRef Mask, FixedVectorType *VecTy) { + int Limit = Mask.size(); + return VecTy->getNumElements() == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask); + }; + // Tries to combine 2 different masks into single one. + auto &&CombineMasks = [](SmallVectorImpl &Mask, ArrayRef ExtMask) { + SmallVector NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) + continue; + NewMask[I] = Mask[ExtMask[I]]; + } + Mask.swap(NewMask); + }; + // Smart shuffle instruction emission, walks through shuffles trees and + // tries to find the best matching vector for the actual shuffle + // instruction. + auto &&CreateShuffle = [this, &IsIdentityMask, + &CombineMasks](Value *V1, Value *V2, + ArrayRef Mask) -> Value * { + assert(V1 && "Expected at least one vector value."); + if (V2 && !isUndefVector(V2)) { + Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + if (isa(V1)) + return PoisonValue::get(FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size())); + Value *Op = V1; + SmallVector CombinedMask(Mask.begin(), Mask.end()); + while (auto *SV = dyn_cast(Op)) { + // Exit if not a fixed vector type or changing size shuffle. + if (!isa(SV->getType())) + break; + // Exit if the identity or broadcast mask is found. + if (IsIdentityMask(CombinedMask, cast(SV->getType()))) + break; + bool IsOp1Undef = isUndefVector(SV->getOperand(0)); + bool IsOp2Undef = isUndefVector(SV->getOperand(1)); + if (!IsOp1Undef && !IsOp2Undef) + break; + SmallVector ShuffleMask(SV->getShuffleMask().begin(), + SV->getShuffleMask().end()); + CombineMasks(ShuffleMask, CombinedMask); + CombinedMask.swap(ShuffleMask); + if (IsOp2Undef) + Op = SV->getOperand(0); + else + Op = SV->getOperand(1); + } + if (!isa(Op->getType()) || + !IsIdentityMask(CombinedMask, cast(Op->getType()))) { + Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; + } + return Op; + }; + + // Can set insert point safely on for the initial gather node. + if (E == VectorizableTree.front().get() && E->getMainOp()) setInsertPointAfterBundle(E); - Value *Vec; + auto &&SetInsertPoint = [this](Value *Vec1, Value *Vec2 = nullptr) { + Instruction *LastInst = nullptr; + auto *IVec1 = dyn_cast(Vec1); + auto *IVec2 = dyn_cast_or_null(Vec2); + if (IVec1 && IVec2) { + if (IVec1->getParent() != Builder.GetInsertBlock() && + IVec2->getParent() != Builder.GetInsertBlock()) + return; + LastInst = ((IVec1->getParent() != IVec2->getParent() && + IVec1->getParent() == Builder.GetInsertBlock()) || + (IVec1->getParent() == IVec2->getParent() && + IVec2->comesBefore(IVec1))) + ? IVec1 + : IVec2; + } else if (IVec1) { + LastInst = IVec1; + } else if (IVec2) { + LastInst = IVec2; + } else { + return; + } + // Set the insertion point after the last instruction in the bundle. Set + // the debug location to Front. + Builder.SetInsertPoint(LastInst->getParent(), ++LastInst->getIterator()); + Builder.SetCurrentDebugLocation(LastInst->getDebugLoc()); + }; + SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); + Value *Vec = nullptr; + // Also, check if the gathered loads can be shuffled out of final vector + // loads. + std::pair> SingleVecMask; + SingleVecMask.first = nullptr; + // Check if these gathers are part of the vectorized nodes. + bool MultipleShuffles = false; + SmallVector GatheredScalars = tryToMatchVector( + E, [this, &SetInsertPoint, &MultipleShuffles, &SingleVecMask, &Vec, + &CreateShuffle](TreeEntry *TE, ArrayRef Mask) { + if (Mask.empty()) { + // This is the first element of multiple (>2) shuffles of loads. + MultipleShuffles = true; + Vec = vectorizeTree(TE); + } else if (!MultipleShuffles) { + SingleVecMask.first = vectorizeTree(TE); + SingleVecMask.second.assign(Mask.begin(), Mask.end()); + } else { + unsigned VecVF = + cast(Vec->getType())->getNumElements(); + unsigned VecValVF = TE->getVectorFactor(); + // Adjust mask and generate smaller shuffle. + unsigned CommonVF = std::max(VecVF, VecValVF); + if (VecVF == VecValVF) { + Value *TEVec = vectorizeTree(TE); + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, TEVec); + Vec = CreateShuffle(Vec, TEVec, Mask); + } else if (VecVF != VecValVF) { + Value *TEVec = vectorizeTree(TE); + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, TEVec); + SmallVector ExpandMask(CommonVF, UndefMaskElem); + std::iota( + ExpandMask.begin(), + std::next(ExpandMask.begin(), std::min(VecVF, VecValVF)), 0); + if (VecVF < VecValVF) + Vec = CreateShuffle(Vec, nullptr, ExpandMask); + else + TEVec = CreateShuffle(TEVec, nullptr, ExpandMask); + Vec = CreateShuffle(Vec, TEVec, Mask); + } + } + }); SmallVector Mask; - SmallVector Entries; - Optional Shuffle = - isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - if (auto *I = dyn_cast(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); + bool AreAllPoisons = all_of(GatheredScalars, PoisonValue::classof); + if (!AreAllPoisons) { + Type *ScalarTy = GatheredScalars.front()->getType(); + SmallVector Entries; + Optional Shuffle = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + if (Shuffle) { + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + Value *Front = vectorizeTree(const_cast(Entries.front())); + Value *Back = vectorizeTree(const_cast(Entries.back())); + if (Entries.size() == 1) { + if (!SingleVecMask.first) { + int Limit = Mask.size() * 2; + if (cast(Front->getType())->getNumElements() != + Mask.size() || + (all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(Mask))) { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Front); + Vec = CreateShuffle(Front, nullptr, Mask); + } else { + Vec = Front; + } + } else { + // Combine single masks into one common. + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first, Front); + // Resize to the required size, if the type is different. + if (SingleVecMask.first->getType() != Front->getType()) { + SingleVecMask.first = CreateShuffle(SingleVecMask.first, nullptr, + SingleVecMask.second); + for (int I = 0, Sz = SingleVecMask.second.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) + SingleVecMask.second[I] = I; + } + } + // Combine masks. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Element already used."); + Mask[I] = SingleVecMask.second[I]; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] += Sz; + } + } + Vec = CreateShuffle(SingleVecMask.first, Front, Mask); + SingleVecMask.first = nullptr; + } + } else { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Front, Back); + Vec = CreateShuffle(Front, Back, Mask); + if (SingleVecMask.first) { + // Combine masks into one common. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Element already used."); + Mask[I] = SingleVecMask.second[I]; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] = I + Sz; + } + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + Vec = CreateShuffle(SingleVecMask.first, Vec, Mask); + SingleVecMask.first = nullptr; + } + } + } + } + + Value *VecVal = nullptr; + if (!AreAllPoisons && + (E->getOpcode() == Instruction::ExtractElement || + all_of(GatheredScalars, + [](Value *V) { + return isa(V); + })) && + allSameType(GatheredScalars)) { + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + SmallVector Mask; + Optional ShuffleKind = + isFixedVectorShuffle(GatheredScalars, Mask); + if (ShuffleKind) { + // Find input vectors. + Value *Vec1 = nullptr; + Value *Vec2 = nullptr; + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] == UndefMaskElem) + continue; + auto *EI = cast(GatheredScalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } + } + if (Vec2) { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec1, Vec2); + VecVal = CreateShuffle(Vec1, Vec2, Mask); + } else if (!Vec1) { + // If extracts are all from undef vectors - just build poison vector. + VecVal = PoisonValue::get(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size())); + } else if (GatheredScalars.size() != + cast(Vec1->getType()) + ->getNumElements() || + !ShuffleVectorInst::isIdentityMask(Mask)) { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec1); + VecVal = CreateShuffle(Vec1, nullptr, Mask); + } + } + } + if (!VecVal) { + IRBuilder<>::InsertPointGuard Guard(Builder); + auto &&SetInsertPointAfterOps = [this](ArrayRef VL) { + // The last instruction in the bundle in program order. + Instruction *LastInst = nullptr; + for (Value *V : VL) { + // If the value was vectorized, need to get the vector value for + // correct insert point. + if (const TreeEntry *TE = getTreeEntry(V)) + if (TE->VectorizedValue) + V = TE->VectorizedValue; + auto *I = dyn_cast(V); + if (!I) + continue; + if (!DT->isReachableFromEntry(I->getParent())) + continue; + if (!LastInst) { + LastInst = I; + continue; + } + if ((LastInst->getParent() != I->getParent() && + DT->dominates(LastInst->getParent(), I->getParent())) || + (LastInst->getParent() == I->getParent() && + LastInst->comesBefore(I))) + LastInst = I; + } + // Set the insertion point after the last instruction in the bundle. + // Set the debug location to Front. + if (!LastInst) + return; + if (isa(LastInst)) + Builder.SetInsertPoint(LastInst->getParent(), + LastInst->getParent()->getFirstInsertionPt()); + else + Builder.SetInsertPoint(LastInst->getParent(), + std::next(LastInst->getIterator())); + Builder.SetCurrentDebugLocation(LastInst->getDebugLoc()); + }; + SetInsertPointAfterOps(GatheredScalars); + VecVal = createBuildVector(GatheredScalars); + } + bool NeedToShuffleReuses = true; + if (Vec) { + assert(!SingleVecMask.first && "Unexpected single vectorized value."); + if (!all_of(GatheredScalars, UndefValue::classof)) { + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (!isa(GatheredScalars[I])) + Mask[I] = I + Sz; + else + Mask[I] = I; + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(Vec, VecVal); + Vec = CreateShuffle(Vec, VecVal, Mask); + } + } else if (SingleVecMask.first) { + if (all_of(GatheredScalars, UndefValue::classof)) { + int Limit = SingleVecMask.second.size() * 2; + if (cast(SingleVecMask.first->getType()) + ->getNumElements() != SingleVecMask.second.size() || + (all_of(SingleVecMask.second, + [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(SingleVecMask.second))) { + if (!ReuseShuffleIndicies.empty()) { + ShuffleBuilder.addMask(SingleVecMask.second); + ShuffleBuilder.addMask(ReuseShuffleIndicies); + Vec = ShuffleBuilder.finalize(SingleVecMask.first); + NeedToShuffleReuses = false; + } else { + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + Vec = CreateShuffle(SingleVecMask.first, nullptr, + SingleVecMask.second); + } + } else { + Vec = SingleVecMask.first; + } + } else { + Value *AdjustedVec; + unsigned VecSize = cast(SingleVecMask.first->getType()) + ->getNumElements(); + if (VecSize > GatheredScalars.size()) { + // Adjust vector value and generate smaller shuffle. + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + AdjustedVec = + CreateShuffle(SingleVecMask.first, nullptr, SingleVecMask.second); + for (int I = 0, Sz = SingleVecMask.second.size(); I < Sz; ++I) { + if (SingleVecMask.second[I] != UndefMaskElem) + SingleVecMask.second[I] = I; + } + } else if (VecSize < GatheredScalars.size()) { + // Adjust vector value and generate smaller shuffle. + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(SingleVecMask.first); + SmallVector AdjustMask(GatheredScalars.size(), UndefMaskElem); + std::iota(AdjustMask.begin(), std::next(AdjustMask.begin(), VecSize), + 0); + AdjustedVec = CreateShuffle(SingleVecMask.first, nullptr, AdjustMask); + } else { + AdjustedVec = SingleVecMask.first; + } + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (!isa(GatheredScalars[I])) + SingleVecMask.second[I] = I + Sz; + } + IRBuilder<>::InsertPointGuard Guard(Builder); + SetInsertPoint(AdjustedVec, VecVal); + Vec = CreateShuffle(AdjustedVec, VecVal, SingleVecMask.second); } } else { - Vec = gather(E->Scalars); + Vec = VecVal; } - if (NeedToShuffleReuses) { - ShuffleBuilder.addMask(E->ReuseShuffleIndices); + if (NeedToShuffleReuses && !ReuseShuffleIndicies.empty()) { + ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); } E->VectorizedValue = Vec; @@ -8438,6 +9311,14 @@ scheduleBlock(BSIter.second.get()); } + // Need to vectorize gathered loads independently since there are no direct + // users, only indirect ones, represented by gathered nodes. + for (const std::unique_ptr &TE : VectorizableTree) { + if (TE->getOpcode() == Instruction::Load && + TE->State != TreeEntry::NeedToGather && TE->UserTreeIndices.empty() && + TE.get() != VectorizableTree.front().get()) + TE->VectorizedValue = vectorizeTree(TE.get()); + } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); diff --git a/llvm/test/Feature/weak_constant.ll b/llvm/test/Feature/weak_constant.ll --- a/llvm/test/Feature/weak_constant.ll +++ b/llvm/test/Feature/weak_constant.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -O3 -S > %t ; RUN: grep 'constant i32 undef' %t | count 1 ; RUN: grep 'constant i32 5' %t | count 1 -; RUN: grep 'i32 7' %t | count 1 +; RUN: grep '[{] i32 7' %t | count 1 ; RUN: grep 'i32 9' %t | count 1 %0 = type { i32, i32 } ; type %0 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -279,15 +279,15 @@ ; CHECK-LABEL: @cmp_lt_gt( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i64 1 ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -18,19 +18,16 @@ ; NOACCELERATE-LABEL: @int_sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -214,19 +211,16 @@ ; NOACCELERATE-LABEL: @exp_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -297,19 +291,16 @@ ; NOACCELERATE-LABEL: @log_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -473,19 +464,16 @@ ; NOACCELERATE-LABEL: @sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -515,19 +503,16 @@ ; NOACCELERATE-LABEL: @cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -1006,19 +991,16 @@ ; NOACCELERATE-LABEL: @int_cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -18,19 +18,16 @@ ; NOACCELERATE-LABEL: @int_sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -214,19 +211,16 @@ ; NOACCELERATE-LABEL: @exp_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -297,19 +291,16 @@ ; NOACCELERATE-LABEL: @log_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -473,19 +464,16 @@ ; NOACCELERATE-LABEL: @sin_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -515,19 +503,16 @@ ; NOACCELERATE-LABEL: @cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: @@ -1006,19 +991,16 @@ ; NOACCELERATE-LABEL: @int_cos_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 -; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) -; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP1]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -159,18 +159,18 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ] ; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[T4]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[T4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]] ; CHECK-NEXT: [[T7:%.*]] = or i32 [[T4]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[T7]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[T7]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP5]] ; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i64 0 ; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-2 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s @@ -17,7 +17,7 @@ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-19' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '8' +; YAML-NEXT: - TreeSize: '10' define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { ; CHECK-LABEL: @test_select( @@ -139,7 +139,7 @@ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-10' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '3' +; YAML-NEXT: - TreeSize: '5' ; CHECK-LABEL: @reduction_with_br( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[H:%.*]], 0 @@ -234,7 +234,7 @@ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-36' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '10' +; YAML-NEXT: - TreeSize: '12' define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { ; CHECK-LABEL: @test_unrolled_select( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -11,17 +11,17 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC:%.*]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP6]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OFFSET]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC:%.*]] to <4 x i8>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -slp-vectorizer -instcombine -mtriple=aarch64--linux-gnu < %s | FileCheck %s +; TODO: Need to tweak costs of the shuffles/insertvector/extractvector. + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -418,43 +420,44 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]]) -; CHECK-NEXT: ret i32 [[TMP29]] +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <8 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[P2]] to <8 x i8>* +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ADD_PTR]] to <8 x i8>* +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i8>, <8 x i8>* [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[ADD_PTR64]] to <8 x i8>* +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP21]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x i8> [[TMP22]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i8> [[TMP22]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP23]] to <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = mul nuw nsw <4 x i32> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP31]] +; CHECK-NEXT: ret i32 [[OP_RDX2]] ; entry: %idx.ext = sext i32 %off1 to i64 @@ -939,46 +942,42 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, i32* [[DST0:%.*]], i64 4 ; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 8 ; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, i32* [[DST0]], i64 12 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <8 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[DST0]] to <4 x i32>* -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP10]], [[TMP13]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[P2]] to <8 x i8>* +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i8>, <8 x i8>* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[DST4]] to <4 x i32>* -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, <4 x i8>* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ADD_PTR]] to <8 x i8>* +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x i8>, <8 x i8>* [[TMP16]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP20]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[DST8]] to <4 x i32>* -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i8>, <4 x i8>* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP26]], [[TMP29]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[ADD_PTR64]] to <8 x i8>* +; CHECK-NEXT: [[TMP25:%.*]] = load <8 x i8>, <8 x i8>* [[TMP24]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i8> [[TMP25]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i8> [[TMP25]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i8> [[TMP26]] to <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP28]], [[TMP29]] ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 @@ -1237,104 +1236,308 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1:%.*]] to <8 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[P2:%.*]] to <8 x i8>* +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = sub nsw <2 x i32> [[TMP6]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = zext <2 x i8> [[TMP5]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = sub nsw <2 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = shl nsw <2 x i32> [[TMP17]], +; CHECK-NEXT: [[TMP19:%.*]] = add nsw <2 x i32> [[TMP18]], [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <2 x i32> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP3]] to <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = sub nsw <2 x i32> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = shl nsw <2 x i32> [[TMP25]], +; CHECK-NEXT: [[TMP27:%.*]] = add nsw <2 x i32> [[TMP26]], [[TMP22]] +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <2 x i32> [[TMP19]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP28]], i64 1 +; CHECK-NEXT: [[ADD55:%.*]] = add nsw i32 [[TMP30]], [[TMP29]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i8* [[ADD_PTR]] to <8 x i8>* +; CHECK-NEXT: [[TMP32:%.*]] = load <8 x i8>, <8 x i8>* [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <8 x i8> [[TMP32]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <8 x i8> [[TMP32]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <8 x i8> [[TMP32]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <8 x i8> [[TMP32]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast i8* [[ADD_PTR64]] to <8 x i8>* +; CHECK-NEXT: [[TMP39:%.*]] = load <8 x i8>, <8 x i8>* [[TMP38]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <8 x i8> [[TMP39]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <8 x i8> [[TMP39]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <8 x i8> [[TMP39]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <8 x i8> [[TMP39]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = sub nsw <2 x i32> [[TMP37]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP43]] to <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = sub nsw <2 x i32> [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = shl nsw <2 x i32> [[TMP48]], +; CHECK-NEXT: [[TMP50:%.*]] = add nsw <2 x i32> [[TMP49]], [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <2 x i32> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <2 x i32> [[TMP54]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = shl nsw <2 x i32> [[TMP56]], +; CHECK-NEXT: [[TMP58:%.*]] = add nsw <2 x i32> [[TMP57]], [[TMP53]] +; CHECK-NEXT: [[TMP59:%.*]] = sub nsw <2 x i32> [[TMP50]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP59]], i64 0 +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP59]], i64 1 +; CHECK-NEXT: [[ADD55_1:%.*]] = add nsw i32 [[TMP61]], [[TMP60]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i8* [[ADD_PTR_1]] to <8 x i8>* +; CHECK-NEXT: [[TMP63:%.*]] = load <8 x i8>, <8 x i8>* [[TMP62]], align 1 +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <8 x i8> [[TMP63]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <8 x i8> [[TMP63]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <8 x i8> [[TMP63]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <8 x i8> [[TMP63]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <8 x i8>* +; CHECK-NEXT: [[TMP70:%.*]] = load <8 x i8>, <8 x i8>* [[TMP69]], align 1 +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <8 x i8> [[TMP70]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <8 x i8> [[TMP70]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <8 x i8> [[TMP70]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <8 x i8> [[TMP70]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = sub nsw <2 x i32> [[TMP68]], [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> +; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32> +; CHECK-NEXT: [[TMP79:%.*]] = sub nsw <2 x i32> [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = shl nsw <2 x i32> [[TMP79]], +; CHECK-NEXT: [[TMP81:%.*]] = add nsw <2 x i32> [[TMP80]], [[TMP76]] +; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> [[TMP81]], <2 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> [[TMP81]], <2 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32> +; CHECK-NEXT: [[TMP86:%.*]] = sub nsw <2 x i32> [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[TMP87:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> +; CHECK-NEXT: [[TMP88:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = sub nsw <2 x i32> [[TMP87]], [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = shl nsw <2 x i32> [[TMP89]], +; CHECK-NEXT: [[TMP91:%.*]] = add nsw <2 x i32> [[TMP90]], [[TMP86]] +; CHECK-NEXT: [[TMP92:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> [[TMP91]], <2 x i32> +; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> [[TMP91]], <2 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = sub nsw <2 x i32> [[TMP81]], [[TMP91]] +; CHECK-NEXT: [[TMP95:%.*]] = shufflevector <2 x i32> [[TMP94]], <2 x i32> [[TMP28]], <2 x i32> +; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <2 x i32> [[TMP94]], <2 x i32> [[TMP28]], <2 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = extractelement <2 x i32> [[TMP94]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP94]], i64 1 +; CHECK-NEXT: [[ADD55_2:%.*]] = add nsw i32 [[TMP98]], [[TMP97]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, <4 x i8>* [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ADD_PTR_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX3_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX5_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = sub nsw <16 x i32> [[TMP31]], [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, <4 x i8>* [[TMP41]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP42]], <4 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP50]], <4 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32> -; CHECK-NEXT: [[TMP76:%.*]] = lshr <16 x i32> [[TMP75]], -; CHECK-NEXT: [[TMP77:%.*]] = and <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = add <16 x i32> [[TMP78]], [[TMP75]] -; CHECK-NEXT: [[TMP80:%.*]] = xor <16 x i32> [[TMP79]], [[TMP78]] -; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP80]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP81]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP81]], 16 +; CHECK-NEXT: [[TMP99:%.*]] = bitcast i8* [[ADD_PTR_2]] to <8 x i8>* +; CHECK-NEXT: [[TMP100:%.*]] = load <8 x i8>, <8 x i8>* [[TMP99]], align 1 +; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <8 x i8> [[TMP100]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <8 x i8> [[TMP100]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <8 x i8> [[TMP100]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <8 x i8> [[TMP100]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <8 x i8>* +; CHECK-NEXT: [[TMP107:%.*]] = load <8 x i8>, <8 x i8>* [[TMP106]], align 1 +; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <8 x i8> [[TMP107]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <8 x i8> [[TMP107]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <8 x i8> [[TMP107]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = shufflevector <8 x i8> [[TMP107]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32> +; CHECK-NEXT: [[TMP113:%.*]] = sub nsw <2 x i32> [[TMP105]], [[TMP112]] +; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> +; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32> +; CHECK-NEXT: [[TMP116:%.*]] = sub nsw <2 x i32> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = shl nsw <2 x i32> [[TMP116]], +; CHECK-NEXT: [[TMP118:%.*]] = add nsw <2 x i32> [[TMP117]], [[TMP113]] +; CHECK-NEXT: [[TMP119:%.*]] = shufflevector <2 x i32> [[TMP50]], <2 x i32> [[TMP118]], <2 x i32> +; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <2 x i32> [[TMP50]], <2 x i32> [[TMP118]], <2 x i32> +; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP101]] to <2 x i32> +; CHECK-NEXT: [[TMP122:%.*]] = zext <2 x i8> [[TMP108]] to <2 x i32> +; CHECK-NEXT: [[TMP123:%.*]] = sub nsw <2 x i32> [[TMP121]], [[TMP122]] +; CHECK-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32> +; CHECK-NEXT: [[TMP125:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> +; CHECK-NEXT: [[TMP126:%.*]] = sub nsw <2 x i32> [[TMP124]], [[TMP125]] +; CHECK-NEXT: [[TMP127:%.*]] = shl nsw <2 x i32> [[TMP126]], +; CHECK-NEXT: [[TMP128:%.*]] = add nsw <2 x i32> [[TMP127]], [[TMP123]] +; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> [[TMP128]], <2 x i32> +; CHECK-NEXT: [[TMP130:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> [[TMP128]], <2 x i32> +; CHECK-NEXT: [[TMP131:%.*]] = sub nsw <2 x i32> [[TMP118]], [[TMP128]] +; CHECK-NEXT: [[TMP132:%.*]] = shufflevector <2 x i32> [[TMP131]], <2 x i32> [[TMP59]], <2 x i32> +; CHECK-NEXT: [[TMP133:%.*]] = shufflevector <2 x i32> [[TMP131]], <2 x i32> [[TMP59]], <2 x i32> +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <2 x i32> [[TMP131]], i64 0 +; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP131]], i64 1 +; CHECK-NEXT: [[ADD55_3:%.*]] = add nsw i32 [[TMP135]], [[TMP134]] +; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 [[ADD55_1]], [[ADD55]] +; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 [[ADD55]], [[ADD55_1]] +; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 [[ADD55_3]], [[ADD55_2]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 [[ADD55_2]], [[ADD55_3]] +; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]] +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15 +; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 +; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535 +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]] +; CHECK-NEXT: [[SHR_I184_1:%.*]] = lshr i32 [[ADD105_1]], 15 +; CHECK-NEXT: [[AND_I185_1:%.*]] = and i32 [[SHR_I184_1]], 65537 +; CHECK-NEXT: [[MUL_I186_1:%.*]] = mul nuw i32 [[AND_I185_1]], 65535 +; CHECK-NEXT: [[ADD_I187_1:%.*]] = add i32 [[MUL_I186_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I188_1:%.*]] = xor i32 [[ADD_I187_1]], [[MUL_I186_1]] +; CHECK-NEXT: [[SHR_I189_1:%.*]] = lshr i32 [[SUB104_1]], 15 +; CHECK-NEXT: [[AND_I190_1:%.*]] = and i32 [[SHR_I189_1]], 65537 +; CHECK-NEXT: [[MUL_I191_1:%.*]] = mul nuw i32 [[AND_I190_1]], 65535 +; CHECK-NEXT: [[ADD_I192_1:%.*]] = add i32 [[MUL_I191_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I193_1:%.*]] = xor i32 [[ADD_I192_1]], [[MUL_I191_1]] +; CHECK-NEXT: [[SHR_I194_1:%.*]] = lshr i32 [[SUB106_1]], 15 +; CHECK-NEXT: [[AND_I195_1:%.*]] = and i32 [[SHR_I194_1]], 65537 +; CHECK-NEXT: [[MUL_I196_1:%.*]] = mul nuw i32 [[AND_I195_1]], 65535 +; CHECK-NEXT: [[ADD_I197_1:%.*]] = add i32 [[MUL_I196_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I198_1:%.*]] = xor i32 [[ADD_I197_1]], [[MUL_I196_1]] +; CHECK-NEXT: [[TMP136:%.*]] = add nsw <2 x i32> [[TMP93]], [[TMP83]] +; CHECK-NEXT: [[TMP137:%.*]] = add nsw <2 x i32> [[TMP92]], [[TMP82]] +; CHECK-NEXT: [[TMP138:%.*]] = extractelement <2 x i32> [[TMP136]], i64 0 +; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP137]], i64 0 +; CHECK-NEXT: [[ADD48:%.*]] = add nsw i32 [[TMP139]], [[TMP138]] +; CHECK-NEXT: [[TMP140:%.*]] = add nsw <2 x i32> [[TMP130]], [[TMP120]] +; CHECK-NEXT: [[TMP141:%.*]] = add nsw <2 x i32> [[TMP129]], [[TMP119]] +; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP140]], i64 0 +; CHECK-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i64 0 +; CHECK-NEXT: [[ADD48_1:%.*]] = add nsw i32 [[TMP143]], [[TMP142]] +; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i64 1 +; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP137]], i64 1 +; CHECK-NEXT: [[ADD48_2:%.*]] = add nsw i32 [[TMP145]], [[TMP144]] +; CHECK-NEXT: [[TMP146:%.*]] = sub nsw <2 x i32> [[TMP136]], [[TMP137]] +; CHECK-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP140]], i64 1 +; CHECK-NEXT: [[TMP148:%.*]] = extractelement <2 x i32> [[TMP141]], i64 1 +; CHECK-NEXT: [[ADD48_3:%.*]] = add nsw i32 [[TMP148]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = sub nsw <2 x i32> [[TMP140]], [[TMP141]] +; CHECK-NEXT: [[ADD78:%.*]] = add nsw i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub nsw i32 [[ADD48]], [[ADD48_1]] +; CHECK-NEXT: [[ADD94:%.*]] = add nsw i32 [[ADD48_3]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub nsw i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 [[ADD94]], [[ADD78]] +; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 [[ADD78]], [[ADD94]] +; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 [[SUB102]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 [[SUB86]], [[SUB102]] +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535 +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]] +; CHECK-NEXT: [[SHR_I184:%.*]] = lshr i32 [[ADD105]], 15 +; CHECK-NEXT: [[AND_I185:%.*]] = and i32 [[SHR_I184]], 65537 +; CHECK-NEXT: [[MUL_I186:%.*]] = mul nuw i32 [[AND_I185]], 65535 +; CHECK-NEXT: [[ADD_I187:%.*]] = add i32 [[MUL_I186]], [[ADD105]] +; CHECK-NEXT: [[XOR_I188:%.*]] = xor i32 [[ADD_I187]], [[MUL_I186]] +; CHECK-NEXT: [[SHR_I189:%.*]] = lshr i32 [[SUB104]], 15 +; CHECK-NEXT: [[AND_I190:%.*]] = and i32 [[SHR_I189]], 65537 +; CHECK-NEXT: [[MUL_I191:%.*]] = mul nuw i32 [[AND_I190]], 65535 +; CHECK-NEXT: [[ADD_I192:%.*]] = add i32 [[MUL_I191]], [[SUB104]] +; CHECK-NEXT: [[XOR_I193:%.*]] = xor i32 [[ADD_I192]], [[MUL_I191]] +; CHECK-NEXT: [[SHR_I194:%.*]] = lshr i32 [[SUB106]], 15 +; CHECK-NEXT: [[AND_I195:%.*]] = and i32 [[SHR_I194]], 65537 +; CHECK-NEXT: [[MUL_I196:%.*]] = mul nuw i32 [[AND_I195]], 65535 +; CHECK-NEXT: [[ADD_I197:%.*]] = add i32 [[MUL_I196]], [[SUB106]] +; CHECK-NEXT: [[XOR_I198:%.*]] = xor i32 [[ADD_I197]], [[MUL_I196]] +; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I188]], [[XOR_I]] +; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I193]] +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I198]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I188_1]], [[ADD113]] +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I193_1]] +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I198_1]] +; CHECK-NEXT: [[TMP150:%.*]] = add nsw <2 x i32> [[TMP149]], [[TMP146]] +; CHECK-NEXT: [[TMP151:%.*]] = sub nsw <2 x i32> [[TMP146]], [[TMP149]] +; CHECK-NEXT: [[TMP152:%.*]] = extractelement <2 x i32> [[TMP150]], i64 0 +; CHECK-NEXT: [[TMP153:%.*]] = extractelement <2 x i32> [[TMP150]], i64 1 +; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 [[TMP153]], [[TMP152]] +; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[TMP152]], [[TMP153]] +; CHECK-NEXT: [[TMP154:%.*]] = extractelement <2 x i32> [[TMP151]], i64 0 +; CHECK-NEXT: [[TMP155:%.*]] = extractelement <2 x i32> [[TMP151]], i64 1 +; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 [[TMP155]], [[TMP154]] +; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 [[TMP154]], [[TMP155]] +; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15 +; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 +; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535 +; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] +; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]] +; CHECK-NEXT: [[SHR_I184_2:%.*]] = lshr i32 [[ADD105_2]], 15 +; CHECK-NEXT: [[AND_I185_2:%.*]] = and i32 [[SHR_I184_2]], 65537 +; CHECK-NEXT: [[MUL_I186_2:%.*]] = mul nuw i32 [[AND_I185_2]], 65535 +; CHECK-NEXT: [[ADD_I187_2:%.*]] = add i32 [[MUL_I186_2]], [[ADD105_2]] +; CHECK-NEXT: [[XOR_I188_2:%.*]] = xor i32 [[ADD_I187_2]], [[MUL_I186_2]] +; CHECK-NEXT: [[SHR_I189_2:%.*]] = lshr i32 [[SUB104_2]], 15 +; CHECK-NEXT: [[AND_I190_2:%.*]] = and i32 [[SHR_I189_2]], 65537 +; CHECK-NEXT: [[MUL_I191_2:%.*]] = mul nuw i32 [[AND_I190_2]], 65535 +; CHECK-NEXT: [[ADD_I192_2:%.*]] = add i32 [[MUL_I191_2]], [[SUB104_2]] +; CHECK-NEXT: [[XOR_I193_2:%.*]] = xor i32 [[ADD_I192_2]], [[MUL_I191_2]] +; CHECK-NEXT: [[SHR_I194_2:%.*]] = lshr i32 [[SUB106_2]], 15 +; CHECK-NEXT: [[AND_I195_2:%.*]] = and i32 [[SHR_I194_2]], 65537 +; CHECK-NEXT: [[MUL_I196_2:%.*]] = mul nuw i32 [[AND_I195_2]], 65535 +; CHECK-NEXT: [[ADD_I197_2:%.*]] = add i32 [[MUL_I196_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I198_2:%.*]] = xor i32 [[ADD_I197_2]], [[MUL_I196_2]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I188_2]], [[ADD113_1]] +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]] +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I193_2]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I198_2]] +; CHECK-NEXT: [[TMP156:%.*]] = sub nsw <2 x i32> [[TMP96]], [[TMP95]] +; CHECK-NEXT: [[TMP157:%.*]] = sub nsw <2 x i32> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i64 1 +; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i64 1 +; CHECK-NEXT: [[ADD78_3:%.*]] = add nsw i32 [[TMP159]], [[TMP158]] +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP156]], i64 0 +; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP157]], i64 0 +; CHECK-NEXT: [[ADD94_3:%.*]] = add nsw i32 [[TMP161]], [[TMP160]] +; CHECK-NEXT: [[TMP162:%.*]] = sub nsw <2 x i32> [[TMP156]], [[TMP157]] +; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 [[ADD94_3]], [[ADD78_3]] +; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 [[ADD78_3]], [[ADD94_3]] +; CHECK-NEXT: [[TMP163:%.*]] = extractelement <2 x i32> [[TMP162]], i64 0 +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x i32> [[TMP162]], i64 1 +; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[TMP163]], [[TMP164]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 [[TMP164]], [[TMP163]] +; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15 +; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537 +; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535 +; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]] +; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]] +; CHECK-NEXT: [[SHR_I184_3:%.*]] = lshr i32 [[ADD105_3]], 15 +; CHECK-NEXT: [[AND_I185_3:%.*]] = and i32 [[SHR_I184_3]], 65537 +; CHECK-NEXT: [[MUL_I186_3:%.*]] = mul nuw i32 [[AND_I185_3]], 65535 +; CHECK-NEXT: [[ADD_I187_3:%.*]] = add i32 [[MUL_I186_3]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I188_3:%.*]] = xor i32 [[ADD_I187_3]], [[MUL_I186_3]] +; CHECK-NEXT: [[SHR_I189_3:%.*]] = lshr i32 [[SUB104_3]], 15 +; CHECK-NEXT: [[AND_I190_3:%.*]] = and i32 [[SHR_I189_3]], 65537 +; CHECK-NEXT: [[MUL_I191_3:%.*]] = mul nuw i32 [[AND_I190_3]], 65535 +; CHECK-NEXT: [[ADD_I192_3:%.*]] = add i32 [[MUL_I191_3]], [[SUB104_3]] +; CHECK-NEXT: [[XOR_I193_3:%.*]] = xor i32 [[ADD_I192_3]], [[MUL_I191_3]] +; CHECK-NEXT: [[SHR_I194_3:%.*]] = lshr i32 [[SUB106_3]], 15 +; CHECK-NEXT: [[AND_I195_3:%.*]] = and i32 [[SHR_I194_3]], 65537 +; CHECK-NEXT: [[MUL_I196_3:%.*]] = mul nuw i32 [[AND_I195_3]], 65535 +; CHECK-NEXT: [[ADD_I197_3:%.*]] = add i32 [[MUL_I196_3]], [[SUB106_3]] +; CHECK-NEXT: [[XOR_I198_3:%.*]] = xor i32 [[ADD_I197_3]], [[MUL_I196_3]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I188_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]] +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I193_3]] +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I198_3]] +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[ADD113_3]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -12,52 +12,52 @@ ; CHECK-LABEL: @wrap_mul4( ; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[TEMP:%.*]] = load double, double* [[ARRAYIDX1_I]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TEMP]], i32 1 ; CHECK-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 0, i64 1 ; CHECK-NEXT: [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP2]], i32 1 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 ; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 ; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 ; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[TEMP10]], i32 1 ; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 ; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TEMP11]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP2]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* ; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP4]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[TMP16]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP9]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x double> [[TMP15]], [[TMP18]] -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[TMP20]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x double>, <2 x double>* [[TMP17]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = fmul <2 x double> [[TMP2]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* +; CHECK-NEXT: [[TMP21:%.*]] = load <2 x double>, <2 x double>* [[TMP20]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP4]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[TMP19]], [[TMP22]] +; CHECK-NEXT: store <2 x double> [[TMP15]], <2 x double>* [[TMP16]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP23]], <2 x double>* [[TMP24]], align 8 ; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[TEMP10]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP2]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> [[TMP24]], double [[TEMP11]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP7]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP23]], [[TMP26]] +; CHECK-NEXT: [[TMP25:%.*]] = fmul <2 x double> [[TMP10]], [[TMP6]] +; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP25]], [[TMP26]] ; CHECK-NEXT: [[TMP28:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[TMP28]], align 8 ; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6 -; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[TMP14]], [[TMP22]] -; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[TMP17]], [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[TMP18]], [[TMP6]] +; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[TMP21]], [[TMP8]] ; CHECK-NEXT: [[TMP31:%.*]] = fadd <2 x double> [[TMP29]], [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP31]], <2 x double>* [[TMP32]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll @@ -3,54 +3,6 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios5.0.0" -define void @foo(float* noalias %a, float* noalias %b, float* noalias %c) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !nontemporal !0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[C:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A:%.*]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4, !nontemporal !0 -; CHECK-NEXT: ret void -; -entry: -; Check that we don't lose !nontemporal hint when vectorizing loads. - %b1 = load float, float* %b, align 4, !nontemporal !0 - %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1 - %b2 = load float, float* %arrayidx.1, align 4, !nontemporal !0 - %arrayidx.2 = getelementptr inbounds float, float* %b, i64 2 - %b3 = load float, float* %arrayidx.2, align 4, !nontemporal !0 - %arrayidx.3 = getelementptr inbounds float, float* %b, i64 3 - %b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0 - -; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it. - %c1 = load float, float* %c, align 4 - %arrayidx2.1 = getelementptr inbounds float, float* %c, i64 1 - %c2 = load float, float* %arrayidx2.1, align 4 - %arrayidx2.2 = getelementptr inbounds float, float* %c, i64 2 - %c3 = load float, float* %arrayidx2.2, align 4 - %arrayidx2.3 = getelementptr inbounds float, float* %c, i64 3 - %c4 = load float, float* %arrayidx2.3, align 4 - - %a1 = fadd float %b1, %c1 - %a2 = fadd float %b2, %c2 - %a3 = fadd float %b3, %c3 - %a4 = fadd float %b4, %c4 - -; Check that we don't lose !nontemporal hint when vectorizing stores. - store float %a1, float* %a, align 4, !nontemporal !0 - %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1 - store float %a2, float* %arrayidx3.1, align 4, !nontemporal !0 - %arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2 - store float %a3, float* %arrayidx3.2, align 4, !nontemporal !0 - %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3 - store float %a4, float* %arrayidx3.3, align 4, !nontemporal !0 - - ret void -} - define void @foo2(float* noalias %a, float* noalias %b) { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll @@ -9,7 +9,7 @@ %add1 = fadd double %f1, %f1 %w0 = getelementptr inbounds double, double* %w, i64 0 %w1 = getelementptr inbounds double, double* %w, i64 1 -; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3 +; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 5 store double %add0, double* %w0, !dbg !9 store double %add1, double* %w1 ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll @@ -13,13 +13,12 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[CALL_I_I:%.*]] = call i32* @get_ptr() -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[GEP_1]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[RES:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll @@ -12,16 +12,16 @@ ; CHECK-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0 ; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1 ; CHECK-NEXT: [[LD_2_0:%.*]] = load double, double* [[GEP_2_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD_2_0]], i32 1 ; CHECK-NEXT: [[LD_2_1:%.*]] = load double, double* [[GEP_2_1]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: ret void @@ -59,16 +59,16 @@ ; CHECK-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds float, float* [[ARRAY2:%.*]], i64 0 ; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds float, float* [[ARRAY2]], i64 1 ; CHECK-NEXT: [[LD_2_0:%.*]] = load float, float* [[GEP_2_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[LD_2_0]], i32 1 ; CHECK-NEXT: [[LD_2_1:%.*]] = load float, float* [[GEP_2_1]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[GEP_1_0]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[LD_2_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[GEP_1_0]] to <2 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[GEP_1_0]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[TMP8]], <2 x float>* [[TMP9]], align 4 ; CHECK-NEXT: ret void @@ -106,16 +106,16 @@ ; CHECK-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds i64, i64* [[ARRAY2:%.*]], i64 0 ; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds i64, i64* [[ARRAY2]], i64 1 ; CHECK-NEXT: [[LD_2_0:%.*]] = load i64, i64* [[GEP_2_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[LD_2_0]], i32 1 ; CHECK-NEXT: [[LD_2_1:%.*]] = load i64, i64* [[GEP_2_1]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[GEP_1_0]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[LD_2_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[GEP_1_0]] to <2 x i64>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[GEP_1_0]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP9]], align 4 ; CHECK-NEXT: ret void @@ -153,16 +153,16 @@ ; CHECK-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds i32, i32* [[ARRAY2:%.*]], i64 0 ; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds i32, i32* [[ARRAY2]], i64 1 ; CHECK-NEXT: [[LD_2_0:%.*]] = load i32, i32* [[GEP_2_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[LD_2_0]], i32 1 ; CHECK-NEXT: [[LD_2_1:%.*]] = load i32, i32* [[GEP_2_1]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[GEP_1_0]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i32> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LD_2_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[GEP_1_0]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i32> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i32> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[GEP_1_0]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -134,8 +134,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -134,8 +134,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -18,24 +18,17 @@ define void @s116_modified(float* %a) { ; CHECK-LABEL: @s116_modified( ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 0 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 4 -; CHECK-NEXT: [[LD1:%.*]] = load float, float* [[GEP1]], align 4 -; CHECK-NEXT: [[LD0:%.*]] = load float, float* [[GEP0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP2]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[LD4:%.*]] = load float, float* [[GEP4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[LD4]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[LD1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[LD1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[GEP0]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[LD4]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[SHUFFLE]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP6]], <4 x float>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; %gep0 = getelementptr inbounds float, float* %a, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -13,16 +13,13 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[V2_LANE_3:%.*]] = extractelement <4 x double> [[V_2]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: call void @use(double [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP4]]) -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -54,17 +51,14 @@ ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V_3:%.*]] = load <2 x double>, <2 x double>* [[PTR_3:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[V_1]], <2 x double> [[V_3]], <2 x i32> ; CHECK-NEXT: [[V3_LANE_1:%.*]] = extractelement <2 x double> [[V_3]], i32 1 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V3_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V3_LANE_1]]) -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -95,19 +89,16 @@ ; CHECK-LABEL: @noop_extract_second_2_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, <4 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <4 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -137,16 +128,14 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: call void @use(double [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP5]]) -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -175,19 +164,16 @@ ; CHECK-LABEL: @extract_lanes_1_and_2( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, <4 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <4 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) -; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -218,28 +204,22 @@ ; CHECK-LABEL: @noop_extracts_existing_vector_4_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_0]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_1]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[V2_LANE_0]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <9 x double> [[TMP7]], <9 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <9 x double> [[TMP3]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -274,29 +254,20 @@ ; CHECK-LABEL: @extracts_jumbled_4_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 -; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 -; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_1]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_3]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[V2_LANE_2]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[V2_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <9 x double> [[TMP9]], <9 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <9 x double> [[TMP3]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -333,51 +304,22 @@ ; CHECK-LABEL: @noop_extracts_9_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 -; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 -; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 -; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 -; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4 ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5 -; CHECK-NEXT: [[V1_LANE_6:%.*]] = extractelement <9 x double> [[V_1]], i32 6 -; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7 -; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 -; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_3]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_4]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_7]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_8]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_0]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_1]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_7]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_1]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_2]], i32 5 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_3]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_4]], i32 7 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V2_LANE_0]], i32 2 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP5]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP25]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -450,47 +392,22 @@ ; CHECK-LABEL: @first_mul_chain_jumbled( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 -; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 -; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 -; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 -; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4 ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5 -; CHECK-NEXT: [[V1_LANE_6:%.*]] = extractelement <9 x double> [[V_1]], i32 6 -; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7 -; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 -; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_6]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_5]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_0]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_2]], i32 2 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul <8 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_7]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_1]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_2]], i32 5 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_3]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_4]], i32 7 -; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP4]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <8 x double> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP22]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP6]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -563,51 +480,23 @@ ; CHECK-LABEL: @first_and_second_mul_chain_jumbled( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8 -; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0 -; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2 -; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3 ; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4 -; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5 -; CHECK-NEXT: [[V1_LANE_6:%.*]] = extractelement <9 x double> [[V_1]], i32 6 -; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7 -; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 -; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_7]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_6]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_1]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_0]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_3]], i32 5 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_2]], i32 6 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_5]], i32 7 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V2_LANE_0]], i32 2 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP5]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]] -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP25]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -244,8 +244,8 @@ ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) ; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> @@ -291,11 +291,11 @@ ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) ; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -244,8 +244,8 @@ ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) ; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> @@ -291,11 +291,11 @@ ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) ; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll @@ -4,16 +4,10 @@ define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) { ; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7 -; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8 -; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7 -; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[ARG0_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 [[ARG0_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[ARG1_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[ARG1_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP3]]) -; CHECK-NEXT: ret <2 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> [[ARG0:%.*]], <9 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; CHECK-NEXT: ret <2 x i16> [[TMP2]] ; bb: %arg0.1 = extractelement <9 x i16> undef, i64 7 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -193,13 +193,13 @@ ; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]]) ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2 -; GCN-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* +; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x half> [[TMP1]], half [[I4]], i32 1 +; GCN-NEXT: [[TMP3:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* ; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 -; GCN-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]]) +; GCN-NEXT: [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* +; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2 +; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP6]]) ; GCN-NEXT: [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)* ; GCN-NEXT: store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2 ; GCN-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -10,12 +10,12 @@ ; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ADD277]], i32 1 ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -58,7 +58,7 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST:%.*]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 ; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -6,20 +6,21 @@ define { i64, i64 } @patatino(double %arg) { ; CHECK-LABEL: @patatino( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* bitcast ([6 x double]* @global to <4 x double>*), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[ARG]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = fptosi <2 x double> [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sext <2 x i32> [[TMP9]] to <2 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 +; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1 +; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1 ; CHECK-NEXT: ret { i64, i64 } [[T17]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll @@ -4,14 +4,6 @@ define void @_Z10fooConvertPDv4_xS0_S0_PKS_() { ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[VECINS_I_5_I1:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll @@ -4,14 +4,6 @@ define void @_Z10fooConvertPDv4_xS0_S0_PKS_() { ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[VECINS_I_5_I1:%.*]] = shufflevector <8 x i32> undef, <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -6,26 +6,26 @@ ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP14:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE2]], +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE1]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX3]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX4]], i32 0 +; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX5:%.*]] = and i32 [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX6:%.*]] = and i32 [[OP_RDX4]], [[OP_RDX5]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX6]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[TMP14]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> @@ -34,26 +34,26 @@ ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; FORCE_REDUCTION-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; FORCE_REDUCTION-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; FORCE_REDUCTION-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) +; FORCE_REDUCTION-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE2]], +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE]]) +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE1]]) ; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] ; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] -; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP4]] -; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX1]], [[OP_RDX4]] +; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX]], [[TMP8]] +; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[TMP0]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[TMP0]], [[TMP4]] +; FORCE_REDUCTION-NEXT: [[OP_RDX6:%.*]] = and i32 [[OP_RDX4]], [[OP_RDX5]] +; FORCE_REDUCTION-NEXT: [[OP_RDX7:%.*]] = and i32 [[OP_RDX3]], [[OP_RDX6]] ; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP4]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX7]], i32 0 ; FORCE_REDUCTION-NEXT: [[TMP10]] = insertelement <2 x i32> [[TMP9]], i32 [[VAL_43]], i32 1 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -336,24 +336,14 @@ define void @vec_shuff_reorder() #0 { ; CHECK-LABEL: @vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -11,12 +11,12 @@ ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -32,12 +32,12 @@ ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -53,12 +53,12 @@ ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -11,12 +11,12 @@ ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -32,12 +32,12 @@ ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> @@ -53,12 +53,12 @@ ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -163,7 +163,7 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @sitofp_4i32_8i16( ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> ; CHECK-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R71]] @@ -201,11 +201,11 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -163,7 +163,7 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @sitofp_4i32_8i16( ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> ; CHECK-NEXT: [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R71]] @@ -201,11 +201,11 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll @@ -7,15 +7,14 @@ ; CHECK-NEXT: [[CALL:%.*]] = load i16, i16* undef, align 2 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CALL37]], i32 6 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i16> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16> -; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP8]], 0 -; CHECK-NEXT: ret i16 [[OP_EXTRA]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i16> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i16> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i1> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP7]], 0 +; CHECK-NEXT: ret i16 [[OP_RDX]] ; entry: %call = load i16, i16* undef, align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -98,7 +98,7 @@ ; SLM-LABEL: @fmul_fdiv_v4f32_const( ; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -98,7 +98,7 @@ ; SLM-LABEL: @fmul_fdiv_v4f32_const( ; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -170,9 +170,9 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], ; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R71]] @@ -234,8 +234,8 @@ ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 ; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> @@ -251,13 +251,13 @@ ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -265,13 +265,13 @@ ; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -279,13 +279,13 @@ ; AVX1-NEXT: ret <8 x i32> [[R71]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -293,13 +293,13 @@ ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -441,10 +441,10 @@ ; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> @@ -458,10 +458,10 @@ ; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -170,9 +170,9 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], ; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R71]] @@ -234,8 +234,8 @@ ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 ; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> @@ -251,13 +251,13 @@ ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -265,13 +265,13 @@ ; SLM-NEXT: ret <8 x i32> [[R71]] ; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX1-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -279,13 +279,13 @@ ; AVX1-NEXT: ret <8 x i32> [[R71]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -293,13 +293,13 @@ ; AVX2-NEXT: ret <8 x i32> [[R71]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]] ; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> @@ -441,10 +441,10 @@ ; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> @@ -458,10 +458,10 @@ ; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 ; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 ; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], ; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> ; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], ; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i64 1 ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll @@ -607,49 +607,25 @@ ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 -; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 -; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 -; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 -; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 -; SLM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; SLM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1 -; SLM-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1 -; SLM-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; SLM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1 -; SLM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 0 -; SLM-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1 -; SLM-NEXT: [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]] -; SLM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[A4]], i32 0 -; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1 -; SLM-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[B4]], i32 0 -; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1 -; SLM-NEXT: [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]] -; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[A6]], i32 0 -; SLM-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1 -; SLM-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[B6]], i32 0 -; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1 -; SLM-NEXT: [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]] -; SLM-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> -; SLM-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP23]], <8 x i32> -; SLM-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP24]], <8 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] +; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> +; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> +; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> ; SLM-NEXT: ret <8 x double> [[R73]] ; ; AVX-LABEL: @buildvector_div_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll @@ -607,49 +607,25 @@ ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 -; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 -; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 -; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 -; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 -; SLM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; SLM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1 -; SLM-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1 -; SLM-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]] -; SLM-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; SLM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1 -; SLM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 0 -; SLM-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1 -; SLM-NEXT: [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]] -; SLM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[A4]], i32 0 -; SLM-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1 -; SLM-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[B4]], i32 0 -; SLM-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1 -; SLM-NEXT: [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]] -; SLM-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[A6]], i32 0 -; SLM-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1 -; SLM-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[B6]], i32 0 -; SLM-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1 -; SLM-NEXT: [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]] -; SLM-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> -; SLM-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP23]], <8 x i32> -; SLM-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP24]], <8 x i32> +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] +; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> +; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> +; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> ; SLM-NEXT: ret <8 x double> [[R73]] ; ; AVX-LABEL: @buildvector_div_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll @@ -3,27 +3,26 @@ define void @test(i32* %0, i32* %1, i32* %2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> , [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP2:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i32> , [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP11]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], +; CHECK-NEXT: [[TMP14:%.*]] = sub <4 x i32> [[TMP12]], +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sub <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> [[TMP17]], <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP2:%.*]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4 ; CHECK-NEXT: ret void ; %4 = load i32, i32* %1, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll @@ -16,12 +16,12 @@ ; CHECK-NEXT: [[A0:%.*]] = load i64, i64* [[A:%.*]], align 8 ; CHECK-NEXT: [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8 ; CHECK-NEXT: [[V1:%.*]] = sub i64 [[A0]], 1 -; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1 -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8 @@ -63,8 +63,6 @@ ; CHECK-LABEL: @bcast_vals2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8 -; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 ; CHECK-NEXT: [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8 @@ -73,10 +71,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] +; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll @@ -16,9 +16,9 @@ ; CHECK-LABEL: @bcast_long( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A0:%.*]] = load i32, i32* [[A:%.*]], align 8 -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[A0]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDXS0]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP1]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll @@ -3,16 +3,16 @@ define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr undef, i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr undef, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0 ; CHECK-NEXT: store <2 x float> zeroinitializer, ptr null, align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: store <2 x float> zeroinitializer, ptr null, align 4 ; CHECK-NEXT: ret void ; @@ -36,17 +36,16 @@ define void @test1() { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr undef, i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr undef, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: store <2 x float> [[TMP8]], ptr null, align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP9]], ptr null, align 4 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds float, ptr undef, i32 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -63,16 +63,16 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[TMP12]], i32 1 ; CHECK-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[CALL]], i32 1 ; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> [[TMP30]], double [[MUL88]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = fsub <2 x double> [[TMP29]], [[TMP27]] +; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP32]], [[TMP31]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 ; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,9 +16,9 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 +; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 ; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer ; SSE-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] @@ -26,9 +26,9 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( -; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; AVX-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 ; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer ; AVX-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll @@ -37,11 +37,11 @@ ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[C1:%.*]] = load i64, i64* [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -65,11 +65,11 @@ ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[C1:%.*]] = load i64, i64* [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -94,11 +94,11 @@ ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[C1:%.*]] = load i64, i64* [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -122,11 +122,11 @@ ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[C1:%.*]] = load i64, i64* [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -150,11 +150,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[C1:%.*]] = load i64, i64* [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -216,11 +216,11 @@ ; CHECK-NEXT: store i64 0, i64* [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[V2:%.*]] = load i64, i64* [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 ; CHECK-NEXT: ret void @@ -251,11 +251,11 @@ ; CHECK-NEXT: store i64 0, i64* [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[V2:%.*]] = load i64, i64* [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 ; CHECK-NEXT: ret void @@ -286,11 +286,11 @@ ; CHECK-NEXT: store i64 0, i64* [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() ; CHECK-NEXT: [[V2:%.*]] = load i64, i64* [[A2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 ; CHECK-NEXT: ret void @@ -323,11 +323,11 @@ ; CHECK-NEXT: store i64 [[U1]], i64* [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[U2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 ; CHECK-NEXT: ret void @@ -361,11 +361,11 @@ ; CHECK-NEXT: store i64 [[U1]], i64* [[B:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[U2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[B]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -15,11 +15,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -11,11 +11,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> poison, double* [[TMP0]], i32 0 ; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double** [[_M_CUR2_I_I81]] to <2 x double*>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double*>, <2 x double*>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double*> [[TMP3]], <2 x double*> [[TMP1]], <2 x i32> ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] @@ -24,10 +24,9 @@ ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double*> [ [[TMP4]], [[ENTRY:%.*]] ], [ [[TMP3]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double** [[_M_CUR2_I_I]] to <2 x double*>* +; CHECK-NEXT: store <2 x double*> [[TMP5]], <2 x double*>* [[TMP6]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -8,16 +8,18 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP9]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP9]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -27,16 +29,13 @@ ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP10]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x double> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP8]], undef ; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB2:%.*]] ; CHECK-NEXT: ] @@ -45,7 +44,7 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[LABEL]] ; CHECK: label: -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB1]] ], [ [[TMP14]], [[BB2]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -93,11 +93,11 @@ define void @zot(%struct.hoge* %arg) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll @@ -13,14 +13,14 @@ ; CHECK: bb1: ; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P2:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP0]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP4]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] @@ -28,7 +28,7 @@ ; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP5]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll @@ -13,14 +13,14 @@ ; CHECK: bb1: ; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P2:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP0]], [[BB1]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP4]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] @@ -28,7 +28,7 @@ ; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[TMP5]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -35,12 +35,12 @@ ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: unreachable @@ -115,16 +115,8 @@ ; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: ; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double undef, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> undef, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: br label [[RETURN:%.*]] ; CHECK: if.then78: ; CHECK-NEXT: br label [[RETURN]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -65,12 +65,12 @@ ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <4 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP2]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <4 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP6]], align 8 @@ -192,12 +192,12 @@ ; CHECK-LABEL: @foo4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <4 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP2]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[CONV]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <4 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x double> [[SHUFFLE]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP6]], align 8 @@ -243,11 +243,11 @@ ; CHECK-LABEL: @partial_mrg( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[N]], 4 @@ -256,10 +256,10 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV12]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: br label [[RETURN]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -16,11 +16,11 @@ ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret i32 0 @@ -61,14 +61,14 @@ ; CHECK-LABEL: @extr_user( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: ret i32 [[TMP5]] ; entry: @@ -99,14 +99,14 @@ ; CHECK-LABEL: @extr_user1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: ret i32 [[TMP5]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll @@ -1,15 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-2 | FileCheck %s +;FIXME: Looks like a problem with the cost of mul <4 x i32> ops, for some reason it is 6 instead of 1-2. + define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture %A) { ; CHECK-LABEL: @diamond_broadcast( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 +; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], undef +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4 ; CHECK-NEXT: ret i32 0 ; entry: @@ -59,11 +67,17 @@ ; CHECK-LABEL: @diamond_broadcast3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 +; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], undef +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[MUL20:%.*]] = mul i32 undef, [[LD]] +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll @@ -3,12 +3,9 @@ define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] -; CHECK-NEXT: ret <2 x i8> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <2 x i8> [[TMP2]] ; %x0 = extractelement <2 x i8> %x, i32 0 %y1 = extractelement <2 x i8> %y, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll @@ -3,12 +3,9 @@ define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]] -; CHECK-NEXT: ret <2 x i8> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] +; CHECK-NEXT: ret <2 x i8> [[TMP2]] ; %x0 = extractelement <2 x i8> %x, i32 0 %y1 = extractelement <2 x i8> %y, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef, align 32 -; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0 -; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[P0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll @@ -14,13 +14,11 @@ define float @multi_uses(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @multi_uses( -; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x float> [[Y:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Y1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Y1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -74,31 +74,27 @@ define float @f_used_twice_in_tree(<2 x float> %x) { ; CHECK-LABEL: @f_used_twice_in_tree( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1 -; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X1]] -; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] ; ; THRESH1-LABEL: @f_used_twice_in_tree( -; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH1-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH1-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], [[X]] +; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH1-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; THRESH1-NEXT: ret float [[ADD]] ; ; THRESH2-LABEL: @f_used_twice_in_tree( -; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 -; THRESH2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH2-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH2-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> +; THRESH2-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], [[X]] +; THRESH2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; THRESH2-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -8,30 +8,25 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[SHUFFLE13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE13]]) -; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE]]) +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[TMP4]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX14]], [[BB1]] ], [ 0, [[BB2:%.*]] ] +; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX13]], [[BB1]] ], [ 0, [[BB2:%.*]] ] ; CHECK-NEXT: ret i32 0 ; CHECK: bb4: -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[SHUFFLE9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE9]] -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[TMP9]], 0 -; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], [[TMP2]] -; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[TMP10]], [[OP_RDX11]] -; CHECK-NEXT: ret i32 [[OP_RDX12]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX9:%.*]] = add i32 [[TMP6]], 0 +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[OP_RDX9]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[TMP7]], [[OP_RDX10]] +; CHECK-NEXT: ret i32 [[OP_RDX11]] ; CHECK: bb5: ; CHECK-NEXT: br label [[BB4:%.*]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -143,8 +143,8 @@ ; PR41892 define void @test_v4f32_v2f32_store(<4 x float> %f, float* %p){ ; CHECK-LABEL: @test_v4f32_v2f32_store( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -143,8 +143,8 @@ ; PR41892 define void @test_v4f32_v2f32_store(<4 x float> %f, float* %p){ ; CHECK-LABEL: @test_v4f32_v2f32_store( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll @@ -16,9 +16,9 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[N:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -622,16 +622,20 @@ ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>* ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4 -; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX1]], [[OP_RDX2]] +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[ARRAYIDX_28]] to <2 x float>* +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP8]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP16]], [[TMP17]] ; CHECK-NEXT: ret float [[OP_RDX3]] ; ; THRESHOLD-LABEL: @loadadd31( @@ -646,18 +650,17 @@ ; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>* ; THRESHOLD-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0 -; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0 -; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1 -; THRESHOLD-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[TMP12]], [[TMP14]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP6]], [[TMP7]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0 +; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) +; THRESHOLD-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0 +; THRESHOLD-NEXT: [[TMP11:%.*]] = bitcast float* [[ARRAYIDX_28]] to <2 x float>* +; THRESHOLD-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4 +; THRESHOLD-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP10]], <2 x i32> +; THRESHOLD-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP8]], <2 x i32> +; THRESHOLD-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP13]] ; THRESHOLD-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 ; THRESHOLD-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 ; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP16]], [[TMP17]] @@ -890,18 +893,18 @@ ; THRESHOLD-LABEL: @extra_args_no_replace( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] -; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 1 -; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[MUL]], i32 0 -; THRESHOLD-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x float> -; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> , <2 x float> [[TMP5]], <2 x i32> +; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 1 +; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 0 +; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* +; THRESHOLD-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* [[TMP2]], align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> , <2 x i32> ; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[TMP5]], [[TMP6]] ; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]] +; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP4]], [[OP_RDX2]] ; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -904,27 +904,25 @@ ; ; THRESH-LABEL: @maxi8_mutiple_uses( ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 -; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; THRESH-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> ; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; THRESH-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) -; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 -; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP3]], i32 1 -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i32 1 -; THRESH-NEXT: [[TMP13:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP12]] -; THRESH-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[TMP10]], <2 x i32> [[TMP12]] -; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0 -; THRESH-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1 -; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] -; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP15]], i32 [[TMP16]] -; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP8]], [[OP_RDX3]] -; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP8]], i32 [[OP_RDX3]] -; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1 -; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 3, i32 4 -; THRESH-NEXT: store i32 [[TMP18]], i32* @var, align 8 +; THRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 1 +; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 +; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; THRESH-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP5]], <2 x i32> +; THRESH-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP3]], <2 x i32> +; THRESH-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP9]] +; THRESH-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP10]], <2 x i32> [[TMP9]] +; THRESH-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP13]], [[TMP14]] +; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP13]], i32 [[TMP14]] +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP7]], [[OP_RDX3]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP7]], i32 [[OP_RDX3]] +; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; THRESH-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 3, i32 4 +; THRESH-NEXT: store i32 [[TMP16]], i32* @var, align 8 ; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -1093,23 +1091,22 @@ ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: ; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; THRESH-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; THRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0 -; THRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 -; THRESH-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i1> poison, i1 [[OP_RDX]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i1> [[TMP11]], i1 [[TMP5]], i32 1 -; THRESH-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP2]], <2 x i32> -; THRESH-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> -; THRESH-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP2]], <2 x i32> -; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP15]] -; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP17]], i32 [[TMP18]] -; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP8]], [[OP_RDX3]] -; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP8]], i32 [[OP_RDX3]] +; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; THRESH-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP2]], <2 x i32> +; THRESH-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP2]], <2 x i32> +; THRESH-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 +; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; THRESH-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> poison, i1 [[OP_RDX]], i32 0 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1 +; THRESH-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP10]], <2 x i32> [[TMP9]] +; THRESH-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP16]], [[TMP17]] +; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP16]], i32 [[TMP17]] +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP7]], [[OP_RDX3]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP7]], i32 [[OP_RDX3]] ; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-const-undef.ll @@ -3,22 +3,13 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { ; CHECK-LABEL: @simple_select( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: ret <4 x float> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP3]], <2 x float> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP6]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -68,6 +68,8 @@ ; THRESHOLD-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; THRESHOLD-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; THRESHOLD-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; THRESHOLD-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 ; THRESHOLD-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 @@ -144,18 +146,16 @@ ; MINTREESIZE-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; MINTREESIZE-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> +; MINTREESIZE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1 ; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1 ; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 +; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) @@ -275,37 +275,19 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP18]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP3]], <2 x float> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -431,20 +413,18 @@ ; must be rescheduled. The case here is from compiling Julia. define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ; THRESHOLD-LABEL: @reschedule_extract( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; THRESHOLD-NEXT: ret <4 x float> [[TMP2]] ; ; NOTHRESHOLD-LABEL: @reschedule_extract( ; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @reschedule_extract( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP2]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -469,20 +449,18 @@ ; instructions that are erased. define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { ; THRESHOLD-LABEL: @take_credit( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; THRESHOLD-NEXT: ret <4 x float> [[TMP2]] ; ; NOTHRESHOLD-LABEL: @take_credit( ; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @take_credit( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP2]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -531,20 +509,18 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 { ; THRESHOLD-LABEL: @_vadd256( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <8 x float> [[TMP1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; THRESHOLD-NEXT: ret <8 x float> [[TMP2]] ; ; NOTHRESHOLD-LABEL: @_vadd256( ; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] ; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @_vadd256( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <8 x float> [[TMP5]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <8 x float> [[TMP2]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -103,6 +103,8 @@ ; THRESHOLD-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; THRESHOLD-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; THRESHOLD-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; THRESHOLD-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 ; THRESHOLD-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 @@ -179,18 +181,16 @@ ; MINTREESIZE-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; MINTREESIZE-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> +; MINTREESIZE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1 ; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1 ; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 +; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) @@ -310,37 +310,19 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP18]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP3]], <2 x float> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -466,20 +448,18 @@ ; must be rescheduled. The case here is from compiling Julia. define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ; THRESHOLD-LABEL: @reschedule_extract( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; THRESHOLD-NEXT: ret <4 x float> [[TMP2]] ; ; NOTHRESHOLD-LABEL: @reschedule_extract( ; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @reschedule_extract( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP2]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -504,20 +484,18 @@ ; instructions that are erased. define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { ; THRESHOLD-LABEL: @take_credit( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <4 x float> [[TMP1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; THRESHOLD-NEXT: ret <4 x float> [[TMP2]] ; ; NOTHRESHOLD-LABEL: @take_credit( ; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] ; NOTHRESHOLD-NEXT: ret <4 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @take_credit( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 3 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <4 x float> [[TMP5]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP2]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -566,20 +544,18 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 { ; THRESHOLD-LABEL: @_vadd256( -; THRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; THRESHOLD-NEXT: ret <8 x float> [[TMP1]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <2 x i32> +; THRESHOLD-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; THRESHOLD-NEXT: ret <8 x float> [[TMP2]] ; ; NOTHRESHOLD-LABEL: @_vadd256( ; NOTHRESHOLD-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] ; NOTHRESHOLD-NEXT: ret <8 x float> [[TMP1]] ; ; MINTREESIZE-LABEL: @_vadd256( -; MINTREESIZE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[B:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[A:%.*]], i32 7 -; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = fadd <8 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: ret <8 x float> [[TMP5]] +; MINTREESIZE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <2 x i32> +; MINTREESIZE-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <8 x float> [[TMP2]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -9,12 +9,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* undef, align 4 ; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_SW:%.*]], %struct.sw* [[V:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* undef, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP1]], i32 1 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[X]] to <2 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 16 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], undef ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, double* [[P2:%.*]], i64 54 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1754:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1754]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[I1771]] to <2 x double>* @@ -15,12 +15,12 @@ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1781]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[I1792]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> , double [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x double> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> [[TMP9]], double [[I1792]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> , double [[TMP12]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x double> [[TMP11]], [[TMP13]] ; CHECK-NEXT: ret <4 x double> [[TMP14]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -8,9 +8,9 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -18,30 +18,30 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP1]] to float -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> , [[TMP8]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 -; CHECK-NEXT: store float [[TMP10]], float* @g, align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2 -; CHECK-NEXT: store float [[TMP12]], float* @c, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0 -; CHECK-NEXT: store float [[TMP13]], float* @d, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3 -; CHECK-NEXT: store float [[TMP14]], float* @e, align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 -; CHECK-NEXT: store float [[TMP15]], float* @f, align 4 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[SHUFFLE]], <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fsub <4 x float> [[TMP11]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x float> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = fsub <2 x float> , [[TMP9]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[SHUFFLE]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: store float [[TMP12]], float* @g, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP13]], i32 2 +; CHECK-NEXT: store float [[TMP14]], float* @c, align 4 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0 +; CHECK-NEXT: store float [[TMP15]], float* @d, align 4 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 3 +; CHECK-NEXT: store float [[TMP16]], float* @e, align 4 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 1 +; CHECK-NEXT: store float [[TMP17]], float* @f, align 4 +; CHECK-NEXT: [[TMP18:%.*]] = fsub <4 x float> [[TMP13]], [[TMP11]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP13]], [[TMP11]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -55,11 +55,12 @@ ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 -; CHECK-NEXT: ret <4 x float> [[I3]] +; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[X2]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[SHUFFLE]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -55,11 +55,12 @@ ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 -; CHECK-NEXT: ret <4 x float> [[I3]] +; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[X2]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[SHUFFLE]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll @@ -7,42 +7,31 @@ define <2 x i64> @load_00123456(ptr nocapture noundef readonly %data) { ; SSE-LABEL: @load_00123456( -; SSE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 1 -; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 2 -; SSE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3 -; SSE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 4 -; SSE-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 5 +; SSE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 4 ; SSE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 6 -; SSE-NEXT: [[T0:%.*]] = load i16, ptr [[DATA]], align 2 -; SSE-NEXT: [[T1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2 -; SSE-NEXT: [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 -; SSE-NEXT: [[T3:%.*]] = load i16, ptr [[ARRAYIDX3]], align 2 -; SSE-NEXT: [[T4:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; SSE-NEXT: [[T5:%.*]] = load i16, ptr [[ARRAYIDX5]], align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[DATA]], align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[ARRAYIDX4]], align 2 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP3]], <8 x i32> ; SSE-NEXT: [[T6:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2 -; SSE-NEXT: [[VECINIT0_I_I:%.*]] = insertelement <8 x i16> undef, i16 [[T0]], i64 0 -; SSE-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x i16> [[VECINIT0_I_I]], i16 [[T0]], i64 1 -; SSE-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I_I]], i16 [[T1]], i64 2 -; SSE-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I]], i16 [[T2]], i64 3 -; SSE-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I_I]], i16 [[T3]], i64 4 -; SSE-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I_I]], i16 [[T4]], i64 5 -; SSE-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I_I]], i16 [[T5]], i64 6 -; SSE-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I_I]], i16 [[T6]], i64 7 -; SSE-NEXT: [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I]] to <2 x i64> +; SSE-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> poison, i16 [[T6]], i32 0 +; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[SHUFFLE]], <8 x i32> +; SSE-NEXT: [[T7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64> ; SSE-NEXT: ret <2 x i64> [[T7]] ; ; AVX-LABEL: @load_00123456( -; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 2 -; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[DATA]], align 2 -; AVX-NEXT: [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX3]], align 2 -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> -; AVX-NEXT: [[VECINIT2_I_I2:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> -; AVX-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I2]], i16 [[T2]], i64 3 -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> -; AVX-NEXT: [[VECINIT7_I_I1:%.*]] = shufflevector <8 x i16> [[VECINIT3_I_I]], <8 x i16> [[TMP4]], <8 x i32> -; AVX-NEXT: [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I1]] to <2 x i64> +; AVX-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 4 +; AVX-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 6 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[DATA]], align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[ARRAYIDX4]], align 2 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP3]], <8 x i32> +; AVX-NEXT: [[T6:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> poison, i16 [[T6]], i32 0 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[SHUFFLE]], <8 x i32> +; AVX-NEXT: [[T7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64> ; AVX-NEXT: ret <2 x i64> [[T7]] ; %arrayidx1 = getelementptr inbounds i16, ptr %data, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -21,22 +21,17 @@ ; CHECK-LABEL: @lookahead_basic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 -; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2 -; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4 -; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <8 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -88,17 +83,16 @@ ; CHECK-LABEL: @lookahead_alt1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 -; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2 ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4 ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <4 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 @@ -149,26 +143,21 @@ ; CHECK-LABEL: @lookahead_alt2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 -; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2 -; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4 -; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <8 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -226,29 +215,28 @@ ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 -; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 1 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A2]], i32 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -316,33 +304,33 @@ ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 -; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 1 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8 -; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8 -; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A2]], i32 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT2:%.*]], align 8 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT3:%.*]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: store double [[TMP15]], double* [[EXT4:%.*]], align 8 +; CHECK-NEXT: store double [[TMP15]], double* [[EXT5:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -435,24 +423,40 @@ ; This checks that we choose to group consecutive extracts from the same vectors. define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2) { -; CHECK-LABEL: @ChecksExtractScores( -; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 -; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 -; CHECK-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4 -; CHECK-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 -; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 -; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 -; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @ChecksExtractScores( +; SSE-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 +; SSE-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 +; SSE-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 +; SSE-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP3]] +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ChecksExtractScores( +; AVX-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 +; AVX-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 +; AVX-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 +; AVX-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[LOADA1]], i32 1 +; AVX-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 +; AVX-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 +; AVX-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] +; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] +; AVX-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; AVX-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; AVX-NEXT: ret void ; %idx0 = getelementptr inbounds double, double* %array, i64 0 %idx1 = getelementptr inbounds double, double* %array, i64 1 @@ -486,11 +490,11 @@ ; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef ; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] ; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[VECEXT_I276_I169]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[C:%.*]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]] ; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], @@ -534,11 +538,11 @@ ; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 ; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] ; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[VECEXT_I276_I169]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[C:%.*]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]] ; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], @@ -582,11 +586,11 @@ ; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 ; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] ; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[VECEXT_I276_I169]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[C:%.*]], i32 1 +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]] ; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] ; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], @@ -631,54 +635,42 @@ ; SSE-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 ; SSE-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; SSE-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 -; SSE-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 -; SSE-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1 ; SSE-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> ; SSE-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 -; SSE-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 -; SSE-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> ; SSE-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] ; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]] -; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]] -; SSE-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; SSE-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ChecksExtractScores_different_vectors( ; AVX-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 ; AVX-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 ; AVX-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 ; AVX-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[LOADA1]], i32 1 ; AVX-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; AVX-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 -; AVX-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 -; AVX-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> ; AVX-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4 ; AVX-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 -; AVX-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 -; AVX-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> ; AVX-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[EXTRA1]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[LOADA0]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[LOADA1]], i32 1 -; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; AVX-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 +; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP5]], [[TMP2]] +; AVX-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP6]], [[TMP4]] +; AVX-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP7]], [[TMP8]] +; AVX-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; AVX-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; AVX-NEXT: ret void ; %idx0 = getelementptr inbounds double, double* %array, i64 0 @@ -720,13 +712,13 @@ ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; SSE-NEXT: [[TMP2:%.*]] = bitcast double* [[GEP_2_0]] to <2 x double>* ; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]] ; SSE-NEXT: ret double [[ADD3]] ; ; AVX-LABEL: @splat_loads( @@ -735,16 +727,16 @@ ; AVX-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0 ; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1 ; AVX-NEXT: [[LD_2_0:%.*]] = load double, double* [[GEP_2_0]], align 8 +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD_2_0]], i32 1 ; AVX-NEXT: [[LD_2_1:%.*]] = load double, double* [[GEP_2_1]], align 8 -; AVX-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* +; AVX-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]] +; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP5]], [[TMP3]] +; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], [[TMP7]] ; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 ; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 ; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP9]], [[TMP10]] @@ -785,15 +777,15 @@ ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; SSE-NEXT: [[TMP2:%.*]] = bitcast double* [[GEP_2_0]] to <2 x double>* ; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer -; SSE-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP6]], [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = fsub <2 x double> [[TMP8]], [[TMP4]] +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP10]], [[TMP11]] ; SSE-NEXT: ret double [[RES]] ; ; AVX-LABEL: @splat_loads_with_internal_uses( @@ -802,17 +794,17 @@ ; AVX-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0 ; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1 ; AVX-NEXT: [[LD_2_0:%.*]] = load double, double* [[GEP_2_0]], align 8 +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD_2_0]], i32 1 ; AVX-NEXT: [[LD_2_1:%.*]] = load double, double* [[GEP_2_1]], align 8 -; AVX-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* -; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fsub <2 x double> [[TMP8]], [[TMP3]] +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* +; AVX-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]] +; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP5]], [[TMP3]] +; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], [[TMP7]] +; AVX-NEXT: [[TMP9:%.*]] = fsub <2 x double> [[TMP8]], [[TMP1]] ; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 ; AVX-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 ; AVX-NEXT: [[RES:%.*]] = fadd double [[TMP10]], [[TMP11]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll @@ -96,67 +96,28 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ undef, [[BB1]] ], [ poison, [[BB2:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[TMP1]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[TMP1]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[TMP1]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[TMP1]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP1]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP1]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP1]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP1]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP1]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP1]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP1]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP1]], i32 15 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <32 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <32 x i32> [[TMP18]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <32 x i32> [[TMP19]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <32 x i32> [[TMP20]], i32 [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <32 x i32> [[TMP21]], i32 [[TMP1]], i32 4 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <32 x i32> [[TMP22]], i32 [[TMP1]], i32 5 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <32 x i32> [[TMP23]], i32 [[TMP1]], i32 6 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <32 x i32> [[TMP24]], i32 [[TMP1]], i32 7 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <32 x i32> [[TMP25]], i32 [[TMP1]], i32 8 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <32 x i32> [[TMP26]], i32 [[TMP1]], i32 9 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <32 x i32> [[TMP27]], i32 [[TMP1]], i32 10 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <32 x i32> [[TMP28]], i32 [[TMP1]], i32 11 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <32 x i32> [[TMP29]], i32 [[TMP1]], i32 12 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <32 x i32> [[TMP30]], i32 [[TMP1]], i32 13 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <32 x i32> [[TMP31]], i32 [[TMP1]], i32 14 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <32 x i32> [[TMP32]], i32 [[TMP1]], i32 15 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <32 x i32> [[TMP33]], i32 [[TMP1]], i32 16 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <32 x i32> [[TMP34]], i32 [[TMP1]], i32 17 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <32 x i32> [[TMP35]], i32 [[TMP1]], i32 18 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <32 x i32> [[TMP36]], i32 [[TMP1]], i32 19 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <32 x i32> [[TMP37]], i32 [[TMP1]], i32 20 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <32 x i32> [[TMP38]], i32 [[TMP1]], i32 21 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <32 x i32> [[TMP39]], i32 [[TMP1]], i32 22 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <32 x i32> [[TMP40]], i32 [[TMP1]], i32 23 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <32 x i32> [[TMP41]], i32 [[TMP1]], i32 24 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <32 x i32> [[TMP42]], i32 [[TMP1]], i32 25 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <32 x i32> [[TMP43]], i32 [[TMP1]], i32 26 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <32 x i32> [[TMP44]], i32 [[TMP1]], i32 27 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <32 x i32> [[TMP45]], i32 [[TMP1]], i32 28 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <32 x i32> [[TMP46]], i32 [[TMP1]], i32 29 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <32 x i32> [[TMP47]], i32 [[TMP1]], i32 30 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <32 x i32> [[TMP48]], i32 [[TMP1]], i32 31 -; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> [[TMP49]]) -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP17]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP50]], [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TMP52]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[OP_RDX2]], [[OP_RDX3]] -; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[OP_RDX1]], [[OP_RDX4]] -; CHECK-NEXT: [[VAL64:%.*]] = add i32 undef, [[OP_RDX5]] +; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ undef, [[BB1]] ], [ undef, [[BB2:%.*]] ] +; CHECK-NEXT: [[VAL4:%.*]] = phi i32 [ undef, [[BB1]] ], [ undef, [[BB2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[VAL4]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i32> poison, i32 [[VAL4]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> [[SHUFFLE]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[SHUFFLE1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[VAL4]], [[VAL4]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[VAL4]], [[VAL4]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[VAL4]], [[VAL4]] +; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[VAL4]], [[VAL4]] +; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[VAL4]], [[VAL4]] +; CHECK-NEXT: [[OP_RDX7:%.*]] = mul i32 [[VAL4]], [[VAL]] +; CHECK-NEXT: [[OP_RDX8:%.*]] = mul i32 [[OP_RDX2]], [[OP_RDX3]] +; CHECK-NEXT: [[OP_RDX9:%.*]] = mul i32 [[OP_RDX4]], [[OP_RDX5]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = mul i32 [[OP_RDX6]], [[OP_RDX7]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = mul i32 [[OP_RDX8]], [[OP_RDX9]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = mul i32 [[OP_RDX11]], [[OP_RDX10]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX12]] +; CHECK-NEXT: [[VAL64:%.*]] = add i32 undef, [[OP_RDX13]] ; CHECK-NEXT: [[VAL65:%.*]] = sext i32 [[VAL64]] to i64 ; CHECK-NEXT: ret i64 [[VAL65]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -9,13 +9,13 @@ ; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_1]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll @@ -63,12 +63,9 @@ ; PR41892 define void @test_v4f32_v2f32_store(<4 x float> %f, float* %p){ ; CHECK-LABEL: @test_v4f32_v2f32_store( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x float> [[F]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[X1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float>* [[TMP2]], align 4 ; CHECK-NEXT: ret void ; %x0 = extractelement <4 x float> %f, i64 0 @@ -96,14 +93,11 @@ define void @test_v4f32_v3f32_store(<4 x float> %f, float* %p){ ; CHECK-LABEL: @test_v4f32_v3f32_store( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x float> [[F]], i64 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x float> [[F]], i64 2 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x float> [[F:%.*]], i64 2 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[X1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[P]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float>* [[TMP2]], align 4 ; CHECK-NEXT: store float [[X2]], float* [[P2]], align 4 ; CHECK-NEXT: ret void ; @@ -159,11 +153,9 @@ define void @test_v4f32_v4f32_splat_store(<4 x float> %f, float* %p){ ; CHECK-LABEL: @test_v4f32_v4f32_splat_store( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: ret void ; %x0 = extractelement <4 x float> %f, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -45,11 +45,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -61,11 +61,11 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE2-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]] ; SSE2-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -97,11 +97,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -113,11 +113,11 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] +; SSE2-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; SSE2-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -149,11 +149,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -165,11 +165,11 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] +; SSE2-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; SSE2-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -203,11 +203,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -219,11 +219,11 @@ ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] -; SSE2-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; SSE2-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; SSE2-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -255,12 +255,12 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 ; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -274,11 +274,11 @@ ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] -; SSE2-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; SSE2-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; SSE2-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -311,9 +311,9 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 ; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -330,11 +330,11 @@ ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; SSE2-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]] -; SSE2-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; SSE2-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; SSE2-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; SSE2-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -373,26 +373,26 @@ ; CHECK: for.body3: ; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 4 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] ; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i64 3 ; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] ; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 @@ -410,26 +410,26 @@ ; SSE2: for.body3: ; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] -; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 -; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] -; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; SSE2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; SSE2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] ; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; SSE2-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; SSE2-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] +; SSE2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP5]] +; SSE2-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; SSE2-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 4 +; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP7]] +; SSE2-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; SSE2-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[TMP8]], align 4 +; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP2]], <4 x i32> +; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP9]], [[TMP10]] ; SSE2-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* ; SSE2-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; SSE2-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] ; SSE2-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i64 3 ; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] ; SSE2-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 ; SSE2-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll @@ -15,10 +15,10 @@ ; CHECK: while.body.lr.ph: ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 ; CHECK-NEXT: [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i1> [[TMP3]], i1 [[ICMP_A1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> [[TMP2]], i1 [[ICMP_A1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP0]] ; CHECK-NEXT: br label [[WHILE_END]] ; CHECK: while.end: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5]], [[WHILE_BODY_LR_PH]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -13,11 +13,11 @@ ; CHECK: if.end: ; CHECK-NEXT: [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef ; CHECK-NEXT: [[SHR15:%.*]] = ashr i32 [[SUB14]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[SHR15]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[SUB14]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP0]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SHUFFLE]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], undef ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> undef ; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[TMP5]] to <4 x i64> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -145,40 +145,47 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP0]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] ; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP12]] = load <2 x float>, <2 x float>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], -; CHECK-NEXT: [[TMP16]] = fadd <4 x float> [[TMP6]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP11]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[ARRAYIDX14]] to <2 x float>* +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x float>, <2 x float>* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[SHUFFLE]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], +; CHECK-NEXT: [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP23]] = insertelement <2 x float> [[TMP22]], float [[TMP21]], i32 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP27]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll @@ -9,16 +9,16 @@ define void @test(double* %i1, double* %i2, double* %o) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1:%.*]], i64 1 -; CHECK-NEXT: [[I1_0:%.*]] = load double, double* [[I1]], align 16 +; CHECK-NEXT: [[I1_0:%.*]] = load double, double* [[I1:%.*]], align 16 +; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1]], i64 1 ; CHECK-NEXT: [[I1_1:%.*]] = load double, double* [[I1_GEP1]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1_0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1 ; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_GEP0:%.*]] = getelementptr inbounds double, double* [[I2:%.*]], i64 0 -; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1 ; CHECK-NEXT: [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16 +; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1 ; CHECK-NEXT: [[I2_1:%.*]] = load double, double* [[I2_GEP1]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I2_0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -35,12 +35,12 @@ ; AVX-NEXT: store i64 [[OR_1]], i64* undef, align 8 ; AVX-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[BAR5]], i32 1 ; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 -; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; AVX-NEXT: [[TMP2:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; AVX-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8 +; AVX-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP1]], [[TMP3]] ; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* ; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; AVX-NEXT: ret void @@ -67,20 +67,20 @@ ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP8]], align 1 +; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP7]], ; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], ; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], ; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] @@ -91,20 +91,20 @@ ; AVX-LABEL: @pr35497( ; AVX-NEXT: entry: ; AVX-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef ; AVX-NEXT: store i64 [[ADD]], i64* undef, align 1 ; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 ; AVX-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 -; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], +; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP8]], align 1 +; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP7]], ; AVX-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], ; AVX-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], ; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -143,11 +143,11 @@ ; ; AVX-LABEL: @store_i64( ; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer -; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]] +; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX-NEXT: [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP7]], [[SHUFFLE]] ; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], ; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> ; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -26,18 +26,18 @@ ; AVX-LABEL: @foo( ; AVX-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 ; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( ; AVX512-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 ; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -7,71 +7,67 @@ define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; SSE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX2-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX512F-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX512F-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX512F-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load( @@ -129,16 +125,16 @@ ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -147,16 +143,16 @@ ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -165,16 +161,16 @@ ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -257,28 +253,28 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -286,28 +282,28 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -543,188 +539,183 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 ; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 -; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] -; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; SSE-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP11]], <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; SSE-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP16]], <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> [[TMP21]], <4 x i32> +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = fdiv <4 x float> [[TMP23]], [[TMP20]] +; SSE-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; SSE-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2 -; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 -; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP35:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP38]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP45:%.*]] = load float, float* [[TMP44]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i64 3 +; SSE-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP49]] +; SSE-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP50]], <4 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP14:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP10]], i64 4 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP14]], i64 5 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP18]], i64 6 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i64 3 +; AVX-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP12]], i64 4 +; AVX-NEXT: [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP16]], i64 5 +; AVX-NEXT: [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP20]], i64 6 +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP29]], i64 7 +; AVX-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX-NEXT: [[TMP36:%.*]] = load <2 x float>, <2 x float>* [[TMP35]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX-NEXT: [[TMP38:%.*]] = load <2 x float>, <2 x float>* [[TMP37]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> [[TMP36]], <8 x i32> +; AVX-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX-NEXT: [[TMP41:%.*]] = load <2 x float>, <2 x float>* [[TMP40]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP42:%.*]] = shufflevector <2 x float> [[TMP36]], <2 x float> [[TMP41]], <8 x i32> +; AVX-NEXT: [[TMP43:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP44:%.*]] = shufflevector <8 x float> [[TMP42]], <8 x float> [[TMP43]], <8 x i32> +; AVX-NEXT: [[TMP45:%.*]] = shufflevector <8 x float> [[TMP44]], <8 x float> [[TMP34]], <8 x i32> +; AVX-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[TMP41]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP46]], <8 x i32> +; AVX-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP47]], <8 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP48]], [[TMP45]] +; AVX-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP14:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP10]], i64 4 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP14]], i64 5 +; AVX2-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP18]], i64 6 +; AVX2-NEXT: [[TMP27:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP22]], i64 7 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i64 3 +; AVX2-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP12]], i64 4 +; AVX2-NEXT: [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP16]], i64 5 +; AVX2-NEXT: [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP20]], i64 6 +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP29]], i64 7 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX2-NEXT: [[TMP36:%.*]] = load <2 x float>, <2 x float>* [[TMP35]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX2-NEXT: [[TMP38:%.*]] = load <2 x float>, <2 x float>* [[TMP37]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> [[TMP36]], <8 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX2-NEXT: [[TMP41:%.*]] = load <2 x float>, <2 x float>* [[TMP40]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP42:%.*]] = shufflevector <2 x float> [[TMP36]], <2 x float> [[TMP41]], <8 x i32> +; AVX2-NEXT: [[TMP43:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP44:%.*]] = shufflevector <8 x float> [[TMP42]], <8 x float> [[TMP43]], <8 x i32> +; AVX2-NEXT: [[TMP45:%.*]] = shufflevector <8 x float> [[TMP44]], <8 x float> [[TMP34]], <8 x i32> +; AVX2-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[TMP41]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP46]], <8 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP47]], <8 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP48]], [[TMP45]] +; AVX2-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX2-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <16 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x float*> [[TMP3]], <16 x float*> poison, <16 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <16 x float*> [[SHUFFLE]], <16 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP4]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP6]] ; AVX512F-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512F-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <16 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x float*> [[TMP3]], <16 x float*> poison, <16 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <16 x float*> [[SHUFFLE]], <16 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP4]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP6]] ; AVX512VL-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512VL-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -7,71 +7,67 @@ define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 -; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; SSE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX2-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 1 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 2 +; AVX512F-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>* +; AVX512F-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], +; AVX512F-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load( @@ -129,16 +125,16 @@ ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -147,16 +143,16 @@ ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -165,16 +161,16 @@ ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -257,28 +253,28 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -286,28 +282,28 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -543,188 +539,183 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 ; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 -; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] -; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; SSE-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP11]], <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; SSE-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP16]], <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> [[TMP21]], <4 x i32> +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = fdiv <4 x float> [[TMP23]], [[TMP20]] +; SSE-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; SSE-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2 -; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 -; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP35:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP38]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP45:%.*]] = load float, float* [[TMP44]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP45]], i64 3 +; SSE-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP49]] +; SSE-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP50]], <4 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP14:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP10]], i64 4 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP14]], i64 5 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP18]], i64 6 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i64 3 +; AVX-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP12]], i64 4 +; AVX-NEXT: [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP16]], i64 5 +; AVX-NEXT: [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP20]], i64 6 +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP29]], i64 7 +; AVX-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX-NEXT: [[TMP36:%.*]] = load <2 x float>, <2 x float>* [[TMP35]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX-NEXT: [[TMP38:%.*]] = load <2 x float>, <2 x float>* [[TMP37]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> [[TMP36]], <8 x i32> +; AVX-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX-NEXT: [[TMP41:%.*]] = load <2 x float>, <2 x float>* [[TMP40]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP42:%.*]] = shufflevector <2 x float> [[TMP36]], <2 x float> [[TMP41]], <8 x i32> +; AVX-NEXT: [[TMP43:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP44:%.*]] = shufflevector <8 x float> [[TMP42]], <8 x float> [[TMP43]], <8 x i32> +; AVX-NEXT: [[TMP45:%.*]] = shufflevector <8 x float> [[TMP44]], <8 x float> [[TMP34]], <8 x i32> +; AVX-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[TMP41]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP46]], <8 x i32> +; AVX-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP47]], <8 x i32> +; AVX-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP48]], [[TMP45]] +; AVX-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP14:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP10]], i64 4 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP14]], i64 5 +; AVX2-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP18]], i64 6 +; AVX2-NEXT: [[TMP27:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP22]], i64 7 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i64 3 +; AVX2-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP12]], i64 4 +; AVX2-NEXT: [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP16]], i64 5 +; AVX2-NEXT: [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP20]], i64 6 +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP29]], i64 7 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP6]] to <2 x float>* +; AVX2-NEXT: [[TMP36:%.*]] = load <2 x float>, <2 x float>* [[TMP35]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP4]] to <2 x float>* +; AVX2-NEXT: [[TMP38:%.*]] = load <2 x float>, <2 x float>* [[TMP37]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> [[TMP36]], <8 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP5]] to <2 x float>* +; AVX2-NEXT: [[TMP41:%.*]] = load <2 x float>, <2 x float>* [[TMP40]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP42:%.*]] = shufflevector <2 x float> [[TMP36]], <2 x float> [[TMP41]], <8 x i32> +; AVX2-NEXT: [[TMP43:%.*]] = shufflevector <2 x float> [[TMP38]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP44:%.*]] = shufflevector <8 x float> [[TMP42]], <8 x float> [[TMP43]], <8 x i32> +; AVX2-NEXT: [[TMP45:%.*]] = shufflevector <8 x float> [[TMP44]], <8 x float> [[TMP34]], <8 x i32> +; AVX2-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[TMP41]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP46]], <8 x i32> +; AVX2-NEXT: [[TMP48:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP47]], <8 x i32> +; AVX2-NEXT: [[TMP49:%.*]] = fdiv <8 x float> [[TMP48]], [[TMP45]] +; AVX2-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX2-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP50]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <16 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x float*> [[TMP3]], <16 x float*> poison, <16 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <16 x float*> [[SHUFFLE]], <16 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP4]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP6]] ; AVX512F-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512F-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP5]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP6]], [[TMP7]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <16 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x float*> [[TMP3]], <16 x float*> poison, <16 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <16 x float*> [[SHUFFLE]], <16 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP4]], i32 4, <16 x i1> , <16 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP6]] ; AVX512VL-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512VL-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll @@ -7,66 +7,34 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %x, ptr nocapture noundef nonnull readonly align 2 dereferenceable(16) %y) { ; SSE-LABEL: @compute_min( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; SSE-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 -; SSE-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) -; SSE-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 -; SSE-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 -; SSE-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 -; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 -; SSE-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) -; SSE-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 -; SSE-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 -; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2 -; SSE-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2 -; SSE-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) -; SSE-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3 -; SSE-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3 -; SSE-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2 -; SSE-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2 -; SSE-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) -; SSE-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 -; SSE-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 -; SSE-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 -; SSE-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 -; SSE-NEXT: [[TMP14:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP12]], i16 [[TMP13]]) -; SSE-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 -; SSE-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 -; SSE-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 -; SSE-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 -; SSE-NEXT: [[TMP17:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP15]], i16 [[TMP16]]) -; SSE-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 -; SSE-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 -; SSE-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2 -; SSE-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2 -; SSE-NEXT: [[TMP20:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP18]], i16 [[TMP19]]) -; SSE-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7 -; SSE-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7 -; SSE-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2 -; SSE-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2 -; SSE-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP21]], i16 [[TMP22]]) -; SSE-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 -; SSE-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48 -; SSE-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 -; SSE-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32 -; SSE-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 -; SSE-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; SSE-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 -; SSE-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] -; SSE-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 -; SSE-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP23]] to i64 -; SSE-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48 -; SSE-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[TMP20]] to i64 -; SSE-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32 -; SSE-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP17]] to i64 -; SSE-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; SSE-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; SSE-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP14]] to i64 -; SSE-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] -; SSE-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 +; SSE-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <2 x i32> +; SSE-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP6]]) +; SSE-NEXT: [[TMP11:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP7]]) +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP8]]) +; SSE-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP9]]) +; SSE-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> +; SSE-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], +; SSE-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP12]] to <2 x i64> +; SSE-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], +; SSE-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] +; SSE-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP11]] to <2 x i64> +; SSE-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], +; SSE-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] +; SSE-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> +; SSE-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; SSE-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 +; SSE-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; SSE-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; ; AVX-LABEL: @compute_min( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -95,19 +95,16 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { ; SSE-LABEL: @logical_and_icmp_diff_preds( ; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 ; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 ; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 ; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X3]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X1]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = icmp slt <2 x i32> [[TMP2]], zeroinitializer -; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP4]], i1 false +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false ; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP5]], i1 false +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false ; SSE-NEXT: ret i1 [[S3]] ; ; AVX-LABEL: @logical_and_icmp_diff_preds( @@ -187,16 +184,13 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) { ; SSE-LABEL: @logical_and_icmp_subvec( -; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X1]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X0]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = icmp slt <2 x i32> [[TMP2]], zeroinitializer +; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer ; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 0 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; SSE-NEXT: [[S1:%.*]] = select i1 [[TMP5]], i1 [[TMP4]], i1 false +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; SSE-NEXT: [[S1:%.*]] = select i1 [[TMP4]], i1 [[TMP3]], i1 false ; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false ; SSE-NEXT: ret i1 [[S2]] ; @@ -337,27 +331,20 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: @logical_and_icmp_clamp_v8i32( -; CHECK-NEXT: [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3 ; CHECK-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0 ; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 ; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[Y3]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = freeze <8 x i1> [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP10]]) -; CHECK-NEXT: ret i1 [[TMP11]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[Y1]], i32 5 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[Y2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y3]], i32 7 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) +; CHECK-NEXT: ret i1 [[TMP8]] ; %x0 = extractelement <8 x i32> %x, i32 0 %x1 = extractelement <8 x i32> %x, i32 1 @@ -388,20 +375,17 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { ; SSE-LABEL: @logical_and_icmp_clamp_partial( ; SSE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = icmp slt <2 x i32> [[TMP2]], ; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 -; SSE-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[X]], -; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false +; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false ; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false -; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 [[OP_RDX1]], i1 false +; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false ; SSE-NEXT: ret i1 [[OP_RDX2]] ; ; AVX-LABEL: @logical_and_icmp_clamp_partial( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll @@ -9,10 +9,11 @@ ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]]) +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[SHUFFLE]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE1]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP65]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -18,41 +18,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1 -; SSE2-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2 -; SSE2-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3 -; SSE2-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 -; SSE2-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2 -; SSE2-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3 -; SSE2-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0 -; SSE2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2 -; SSE2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3 -; SSE2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4 -; SSE2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5 -; SSE2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6 -; SSE2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7 -; SSE2-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1 -; SSE2-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2 -; SSE2-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3 -; SSE2-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1 -; SSE2-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2 -; SSE2-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3 -; SSE2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0 -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1 -; SSE2-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2 -; SSE2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3 -; SSE2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4 -; SSE2-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5 -; SSE2-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6 -; SSE2-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7 -; SSE2-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]]) -; SSE2-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]] +; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; @@ -70,41 +40,11 @@ ; ; AVX-LABEL: @reduce_and4( ; AVX-NEXT: entry: -; AVX-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 -; AVX-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1 -; AVX-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2 -; AVX-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3 -; AVX-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 -; AVX-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 -; AVX-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2 -; AVX-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3 -; AVX-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7 -; AVX-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0 -; AVX-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1 -; AVX-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2 -; AVX-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3 -; AVX-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0 -; AVX-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1 -; AVX-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2 -; AVX-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1 -; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2 -; AVX-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6 -; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7 -; AVX-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]]) -; AVX-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]]) -; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]] +; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]] ; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; AVX-NEXT: ret i32 [[OP_RDX1]] ; @@ -154,41 +94,11 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4_transpose( -; SSE2-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1 -; SSE2-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 -; SSE2-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1 -; SSE2-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1 -; SSE2-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2 -; SSE2-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2 -; SSE2-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2 -; SSE2-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2 -; SSE2-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3 -; SSE2-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1 -; SSE2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2 -; SSE2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3 -; SSE2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4 -; SSE2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5 -; SSE2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6 -; SSE2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7 -; SSE2-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3 -; SSE2-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3 -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0 -; SSE2-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1 -; SSE2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2 -; SSE2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3 -; SSE2-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4 -; SSE2-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5 -; SSE2-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6 -; SSE2-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7 -; SSE2-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]]) -; SSE2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]] +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) +; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; @@ -204,41 +114,11 @@ ; SSE42-NEXT: ret i32 [[OP_RDX3]] ; ; AVX-LABEL: @reduce_and4_transpose( -; AVX-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 -; AVX-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 -; AVX-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0 -; AVX-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0 -; AVX-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1 -; AVX-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 -; AVX-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1 -; AVX-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1 -; AVX-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2 -; AVX-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2 -; AVX-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2 -; AVX-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2 -; AVX-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3 -; AVX-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7 -; AVX-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3 -; AVX-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3 -; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1 -; AVX-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4 -; AVX-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5 -; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6 -; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7 -; AVX-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]]) -; AVX-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) -; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]] +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]] ; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; AVX-NEXT: ret i32 [[OP_RDX1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -87,15 +87,15 @@ ; CHECK-LABEL: @fcmp_lt_gt( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[MUL]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 @@ -136,15 +136,15 @@ define i1 @fcmp_lt(double %a, double %b, double %c) { ; CHECK-LABEL: @fcmp_lt( ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[MUL]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[B]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = fcmp uge <2 x double> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -6,13 +6,13 @@ ; CHECK-LABEL: @fextr( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <8 x i16>, <8 x i16>* undef, align 16 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: br label [[T:%.*]] ; CHECK: t: ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[LD]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP0]], <8 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[LD]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP1]], <8 x i16>* [[TMP2]], align 2 ; CHECK-NEXT: ret void ; ; YAML: Pass: slp-vectorizer @@ -20,7 +20,7 @@ ; YAML-NEXT: Function: fextr ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-20' +; YAML-NEXT: - Cost: '-19' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -13,18 +13,16 @@ ; CHECK-NEXT: [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 16 -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP9]], [[A_088]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP8]], [[A_088]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] @@ -93,7 +91,7 @@ ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-5' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '4' + ; YAML-NEXT: - TreeSize: '5' ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll @@ -70,7 +70,7 @@ ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '4' + ; YAML-NEXT: - TreeSize: '6' %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -5,15 +5,15 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float*> poison, float* [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2 ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP3]], i32 4, <8 x i1> , <8 x float> undef) -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> , <16 x float> [[SHUFFLE1]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[SHUFFLE1]], [[TMP6]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP4]] to <16 x float>* -; CHECK-NEXT: store <16 x float> [[SHUFFLE2]], <16 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[SHUFFLE1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP4]] to <16 x float>* +; CHECK-NEXT: store <16 x float> [[TMP8]], <16 x float>* [[TMP9]], align 4 ; CHECK-NEXT: ret void ; %2 = getelementptr inbounds float, float* %p, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,14 +8,14 @@ ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8 addrspace(1)*> poison, i8 addrspace(1)* [[TMP0:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8 addrspace(1)*> [[TMP3]], <4 x i8 addrspace(1)*> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, <4 x i8 addrspace(1)*> [[SHUFFLE]], <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, <4 x i8 addrspace(1)*> [[SHUFFLE]], <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[TMP5]] to float addrspace(1)* ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i8 addrspace(1)*> [[TMP4]] to <4 x float addrspace(1)*> ; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1f32(<4 x float addrspace(1)*> [[TMP7]], i32 4, <4 x i1> , <4 x float> undef) -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float addrspace(1)* [[TMP6]] to <8 x float> addrspace(1)* ; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, <8 x float> addrspace(1)* [[TMP9]], align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE1]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -9,33 +9,30 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP3]] to <2 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP4]] to <2 x float>* ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = fsub <2 x float> [[TMP12]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[TMP12]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> -; CHECK-NEXT: [[TMP19]] = fadd <2 x float> [[TMP2]], [[TMP18]] -; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x float> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP17]] = fadd <2 x float> [[TMP2]], [[TMP16]] +; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP19]], <2 x float>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP17]], <2 x float>* [[TMP21]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -14,15 +14,15 @@ ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[SHUFFLE1]], +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[SHUFFLE1]], ; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 ; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 ; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = lshr <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_4_I_I]], i32 5 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_5_I_I]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/return.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll @@ -42,12 +42,11 @@ define double @return2(double* nocapture readonly %x) { ; CHECK-LABEL: @return2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X:%.*]] to <4 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 ; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -9,14 +9,14 @@ ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr undef, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> , <2 x float> [[TMP0]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> zeroinitializer, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr undef, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> zeroinitializer, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer @@ -24,7 +24,7 @@ ; CHECK: bb3: ; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll @@ -12,8 +12,14 @@ define void @test(float * %a, float * %b, float * %c, float * %d) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[L0:%.*]] = load float, float* [[A:%.*]], align 4 +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NEXT: [[L1:%.*]] = load float, float* [[A1]], align 4 +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 +; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A2]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: call void @unknown() @@ -42,8 +48,10 @@ ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: call void @unknown() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B:%.*]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: store float [[L0]], float* [[B]], align 4 +; CHECK-NEXT: store float [[L1]], float* [[B1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B2]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C:%.*]] to <4 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[D:%.*]] to <4 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -12,18 +12,16 @@ ; CHECK-NEXT: [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 16 -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP9]], [[A_088]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP8]], [[A_088]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -8,10 +8,9 @@ ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], undef ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> undef, <4 x i32> [[SHUFFLE1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-2 | FileCheck %s +;FIXME: Looks like a problem with the cost of mul <4 x i32> ops, for some reason it is 6 instead of 1-2. + define void @foo(i32* noalias nocapture writeonly %B, i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %C, i32 %n, i32 %m) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -62,14 +62,14 @@ ; Don't vectorize volatile loads. define void @test_volatile_load(double* %a, double* %b, double* %c) { ; CHECK-LABEL: @test_volatile_load( -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A]], align 8 -; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B]], align 8 +; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8 +; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 ; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -9,27 +9,42 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 3 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4 +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 5 ; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 6 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14 +; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 7 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 5 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32*> poison, i32* [[ARRAYIDX1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32*> [[TMP2]], i32* [[ARRAYIDX6]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32*> [[TMP3]], i32* [[ARRAYIDX13]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32*> [[TMP4]], i32* [[ARRAYIDX20]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32*> [[TMP5]], i32* [[ARRAYIDX27]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32*> [[TMP6]], i32* [[ARRAYIDX34]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32*> [[TMP7]], i32* [[ARRAYIDX41]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef) -; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32*> poison, i32* [[ARRAYIDX]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32*> [[TMP0]], i32* [[ARRAYIDX4]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32*> [[TMP1]], i32* [[ARRAYIDX11]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32*> [[TMP2]], i32* [[ARRAYIDX18]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32*> [[TMP3]], i32* [[ARRAYIDX25]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32*> [[TMP4]], i32* [[ARRAYIDX32]], i32 5 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32*> [[TMP5]], i32* [[ARRAYIDX39]], i32 6 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32*> [[TMP6]], i32* [[ARRAYIDX46]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32*> [[TMP7]], i32* [[ARRAYIDX20]], i32 8 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 9 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32*> [[TMP9]], i32* [[ARRAYIDX13]], i32 10 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32*> [[TMP10]], i32* [[ARRAYIDX6]], i32 11 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32*> [[TMP11]], i32* [[ARRAYIDX27]], i32 12 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32*> [[TMP12]], i32* [[ARRAYIDX34]], i32 13 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32*> [[TMP13]], i32* [[ARRAYIDX41]], i32 14 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32*> [[TMP14]], i32* [[ARRAYIDX1]], i32 15 +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP19]], <8 x i32>* [[TMP20]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -110,12 +125,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -172,19 +185,17 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[G22]] to <2 x i32>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP13]], <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[G20]] to <2 x i32>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[G22]] to <2 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP12]], <8 x i32>* [[TMP13]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,22 +4,19 @@ define void @test(i32* noalias %p, i32* noalias %addr, i32* noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32*> poison, i32* [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i32*> [[TMP0]], <8 x i32*> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE1]], <8 x i32> ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 8, <8 x i1> , <8 x i32> undef) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32*> poison, i32* [[P:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i32*> [[TMP4]], <8 x i32*> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE2]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP5]], i32 4, <8 x i1> , <8 x i32> undef) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP1]], i32 8, <8 x i1> , <8 x i32> undef) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE2]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP8]], i32 4, <8 x i1> , <8 x i32> undef) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[TMP10]], <8 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ADDR:%.*]] to <16 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32*> poison, i32* [[P:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32*> [[TMP2]], <16 x i32*> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <16 x i32*> [[SHUFFLE]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP3]], i32 4, <16 x i1> , <16 x i32> undef) +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP7]], <8 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll @@ -161,10 +161,10 @@ ; CHECK-NEXT: [[STACK:%.*]] = call i8* @llvm.stacksave() ; CHECK-NEXT: [[V1:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[V2:%.*]] = alloca inalloca i8, align 1 -; CHECK-NEXT: call void @use(i8* inalloca(i8) [[V2]]) #[[ATTR4]] -; CHECK-NEXT: call void @llvm.stackrestore(i8* [[STACK]]) ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1 +; CHECK-NEXT: call void @use(i8* inalloca(i8) [[V2]]) #[[ATTR4]] +; CHECK-NEXT: call void @llvm.stackrestore(i8* [[STACK]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[B:%.*]] to <2 x i8*>* ; CHECK-NEXT: store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP4]], align 8 @@ -264,9 +264,9 @@ ; CHECK-NEXT: store i8 0, i8* [[V1]], align 1 ; CHECK-NEXT: call void @llvm.stackrestore(i8* [[STACK]]) ; CHECK-NEXT: [[V2:%.*]] = alloca i8, align 1 -; CHECK-NEXT: store i8 0, i8* [[V2]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1 +; CHECK-NEXT: store i8 0, i8* [[V2]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[B:%.*]] to <2 x i8*>* ; CHECK-NEXT: store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP4]], align 8 @@ -309,19 +309,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8** [[VAR32]] to <4 x i8*>* ; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 4 ; CHECK-NEXT: [[VAR4:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8*> poison, i8* [[VAR4]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8*> [[TMP2]], <4 x i8*> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VAR5:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8*> poison, i8* [[VAR4]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8*> [[TMP3]], i8* [[VAR5]], i32 1 ; CHECK-NEXT: [[VAR17:%.*]] = call i8* @wibble(i8* [[VAR4]]) ; CHECK-NEXT: [[VAR23:%.*]] = call i8* @llvm.stacksave() ; CHECK-NEXT: [[VAR24:%.*]] = alloca inalloca i32, align 4 ; CHECK-NEXT: call void @quux(i32* inalloca(i32) [[VAR24]]) ; CHECK-NEXT: call void @llvm.stackrestore(i8* [[VAR23]]) -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8*> poison, i8* [[VAR4]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8*> [[TMP2]], <4 x i8*> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: store <4 x i8*> [[SHUFFLE]], <4 x i8*>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8*> [[TMP2]], i8* [[VAR5]], i32 1 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i8*> [[TMP3]], <4 x i8*> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>* -; CHECK-NEXT: store <4 x i8*> [[SHUFFLE1]], <4 x i8*>* [[TMP4]], align 8 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i8*> [[TMP4]], <2 x i8*> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>* +; CHECK-NEXT: store <4 x i8*> [[SHUFFLE1]], <4 x i8*>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; %var2 = alloca i8 @@ -361,9 +362,9 @@ ; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 4 ; CHECK-NEXT: [[VAR4:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[VAR5:%.*]] = alloca i8, align 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8*> poison, i8* [[VAR4]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8*> [[TMP1]], i8* [[VAR5]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8*> [[TMP2]], <4 x i8*> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[VAR4]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[VAR5]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8*> [[TMP2]], <2 x i8*> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>* ; CHECK-NEXT: store <4 x i8*> [[SHUFFLE]], <4 x i8*>* [[TMP3]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -9,26 +9,26 @@ ; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 ; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_4]], i32 1 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 +; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 +; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[LOAD_7]], i32 1 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[LOAD_8]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP4]], [[TMP8]] ; CHECK-NEXT: br label [[BLOCK1:%.*]] ; CHECK: block1: ; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 5 @@ -40,7 +40,7 @@ ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP9]], <2 x i32>* [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -8,25 +8,21 @@ ; ENABLED-LABEL: @test_supernode_add( ; ENABLED-NEXT: entry: ; ENABLED-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1 ; ENABLED-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] -; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP6]] +; ENABLED-NEXT: [[TMP10:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; ENABLED-NEXT: ret void ; entry: @@ -65,25 +61,21 @@ ; ENABLED-LABEL: @test_supernode_addsub( ; ENABLED-NEXT: entry: ; ENABLED-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1 ; ENABLED-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] -; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP3]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP6]] +; ENABLED-NEXT: [[TMP10:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; ENABLED-NEXT: ret void ; entry: @@ -205,24 +197,23 @@ ; ENABLED-NEXT: entry: ; ENABLED-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0 -; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1 ; ENABLED-NEXT: [[IDXC:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXD:%.*]] = getelementptr inbounds double, double* [[DARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[C:%.*]] = load double, double* [[IDXC]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 ; ENABLED-NEXT: [[D:%.*]] = load double, double* [[IDXD]], align 8 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] -; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[D]], i32 1 +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP6]] +; ENABLED-NEXT: [[TMP10:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -157,16 +157,16 @@ ; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP1]], i32 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] @@ -253,22 +253,12 @@ ; CHECK-LABEL: @tiny_vector_gather( ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[V1:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[V2:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[PTR0]], align 16 -; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[PTR1]], align 4 -; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[PTR2]], align 8 -; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[PTR3]], align 4 -; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[PTR4]], align 16 -; CHECK-NEXT: [[PTR5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 5 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[PTR5]], align 4 -; CHECK-NEXT: [[PTR6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 6 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[PTR6]], align 8 -; CHECK-NEXT: [[PTR7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 7 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[PTR7]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[PTR0]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; %1 = load i32, i32* %v1, align 4 @@ -296,10 +286,10 @@ ; CHECK-LABEL: @tiny_vector_with_diff_opcode( ; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[V1:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 undef to i16 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i16> [[TMP3]], i16 [[TMP2]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP2]], i32 1 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[PTR0]] to <8 x i16>* ; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP5]], align 16 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll @@ -28,101 +28,101 @@ ; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[SPEC_SELECT8_3_7:%.*]], [[FOR_COND]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], -183 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = sub <4 x i32> [[SHUFFLE]], [[TMP0]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP20]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP21]], [[B_0]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP21]], i32 [[B_0]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[SHUFFLE]], [[TMP0]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP21]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP22]], [[B_0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP22]], i32 [[B_0]] ; CHECK-NEXT: [[SUB_116:%.*]] = sub i32 [[TMP15]], [[TMP1]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[SUB_116]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = icmp slt i32 [[SUB_116]], 0 ; CHECK-NEXT: [[NEG_117:%.*]] = sub nsw i32 0, [[SUB_116]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[NEG_117]], i32 [[SUB_116]] -; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP23]], [[OP_RDX1]] -; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP23]], i32 [[OP_RDX1]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[NEG_117]], i32 [[SUB_116]] +; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP24]], [[OP_RDX1]] +; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP24]], i32 [[OP_RDX1]] ; CHECK-NEXT: [[SUB_1_1:%.*]] = sub i32 [[TMP15]], [[TMP2]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i32 [[SUB_1_1]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = icmp slt i32 [[SUB_1_1]], 0 ; CHECK-NEXT: [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[NEG_1_1]], i32 [[SUB_1_1]] -; CHECK-NEXT: [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP25]], [[SPEC_SELECT8_120]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[NEG_1_1]], i32 [[SUB_1_1]] +; CHECK-NEXT: [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP26]], [[SPEC_SELECT8_120]] ; CHECK-NEXT: [[NARROW:%.*]] = or i1 [[CMP12_1_1]], [[CMP12_118]] -; CHECK-NEXT: [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP25]], i32 [[SPEC_SELECT8_120]] +; CHECK-NEXT: [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP26]], i32 [[SPEC_SELECT8_120]] ; CHECK-NEXT: [[SUB_2_1:%.*]] = sub i32 [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp slt i32 [[SUB_2_1]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = icmp slt i32 [[SUB_2_1]], 0 ; CHECK-NEXT: [[NEG_2_1:%.*]] = sub nsw i32 0, [[SUB_2_1]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[NEG_2_1]], i32 [[SUB_2_1]] -; CHECK-NEXT: [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP27]], [[SPEC_SELECT8_1_1]] +; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[NEG_2_1]], i32 [[SUB_2_1]] +; CHECK-NEXT: [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP28]], [[SPEC_SELECT8_1_1]] ; CHECK-NEXT: [[NARROW34:%.*]] = or i1 [[CMP12_2_1]], [[NARROW]] -; CHECK-NEXT: [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP27]], i32 [[SPEC_SELECT8_1_1]] +; CHECK-NEXT: [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP28]], i32 [[SPEC_SELECT8_1_1]] ; CHECK-NEXT: [[SUB_3_1:%.*]] = sub i32 [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp slt i32 [[SUB_3_1]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = icmp slt i32 [[SUB_3_1]], 0 ; CHECK-NEXT: [[NEG_3_1:%.*]] = sub nsw i32 0, [[SUB_3_1]] -; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[NEG_3_1]], i32 [[SUB_3_1]] -; CHECK-NEXT: [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP29]], [[SPEC_SELECT8_2_1]] +; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[NEG_3_1]], i32 [[SUB_3_1]] +; CHECK-NEXT: [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP30]], [[SPEC_SELECT8_2_1]] ; CHECK-NEXT: [[NARROW35:%.*]] = or i1 [[CMP12_3_1]], [[NARROW34]] ; CHECK-NEXT: [[SPEC_SELECT_3_1:%.*]] = zext i1 [[NARROW35]] to i32 -; CHECK-NEXT: [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP29]], i32 [[SPEC_SELECT8_2_1]] +; CHECK-NEXT: [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP30]], i32 [[SPEC_SELECT8_2_1]] ; CHECK-NEXT: [[SUB_222:%.*]] = sub i32 [[TMP15]], [[TMP5]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp slt i32 [[SUB_222]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = icmp slt i32 [[SUB_222]], 0 ; CHECK-NEXT: [[NEG_223:%.*]] = sub nsw i32 0, [[SUB_222]] -; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[NEG_223]], i32 [[SUB_222]] -; CHECK-NEXT: [[CMP12_224:%.*]] = icmp slt i32 [[TMP31]], [[SPEC_SELECT8_3_1]] -; CHECK-NEXT: [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP31]], i32 [[SPEC_SELECT8_3_1]] +; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[NEG_223]], i32 [[SUB_222]] +; CHECK-NEXT: [[CMP12_224:%.*]] = icmp slt i32 [[TMP32]], [[SPEC_SELECT8_3_1]] +; CHECK-NEXT: [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP32]], i32 [[SPEC_SELECT8_3_1]] ; CHECK-NEXT: [[SUB_1_2:%.*]] = sub i32 [[TMP15]], [[TMP6]] -; CHECK-NEXT: [[TMP32:%.*]] = icmp slt i32 [[SUB_1_2]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = icmp slt i32 [[SUB_1_2]], 0 ; CHECK-NEXT: [[NEG_1_2:%.*]] = sub nsw i32 0, [[SUB_1_2]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[NEG_1_2]], i32 [[SUB_1_2]] -; CHECK-NEXT: [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP33]], [[SPEC_SELECT8_226]] -; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]] -; CHECK-NEXT: [[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP33]], i32 [[SPEC_SELECT8_226]] +; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[NEG_1_2]], i32 [[SUB_1_2]] +; CHECK-NEXT: [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP34]], [[SPEC_SELECT8_226]] +; CHECK-NEXT: [[TMP35:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]] +; CHECK-NEXT: [[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP34]], i32 [[SPEC_SELECT8_226]] ; CHECK-NEXT: [[SUB_2_2:%.*]] = sub i32 [[TMP15]], [[TMP7]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp slt i32 [[SUB_2_2]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = icmp slt i32 [[SUB_2_2]], 0 ; CHECK-NEXT: [[NEG_2_2:%.*]] = sub nsw i32 0, [[SUB_2_2]] -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[NEG_2_2]], i32 [[SUB_2_2]] -; CHECK-NEXT: [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP36]], [[SPEC_SELECT8_1_2]] -; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[CMP12_2_2]], [[TMP34]] -; CHECK-NEXT: [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP36]], i32 [[SPEC_SELECT8_1_2]] +; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[NEG_2_2]], i32 [[SUB_2_2]] +; CHECK-NEXT: [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP37]], [[SPEC_SELECT8_1_2]] +; CHECK-NEXT: [[TMP38:%.*]] = or i1 [[CMP12_2_2]], [[TMP35]] +; CHECK-NEXT: [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP37]], i32 [[SPEC_SELECT8_1_2]] ; CHECK-NEXT: [[SUB_3_2:%.*]] = sub i32 [[TMP15]], [[TMP8]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp slt i32 [[SUB_3_2]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = icmp slt i32 [[SUB_3_2]], 0 ; CHECK-NEXT: [[NEG_3_2:%.*]] = sub nsw i32 0, [[SUB_3_2]] -; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[NEG_3_2]], i32 [[SUB_3_2]] -; CHECK-NEXT: [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP39]], [[SPEC_SELECT8_2_2]] -; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[CMP12_3_2]], [[TMP37]] -; CHECK-NEXT: [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP40]], i32 2, i32 [[SPEC_SELECT_3_1]] -; CHECK-NEXT: [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP39]], i32 [[SPEC_SELECT8_2_2]] +; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[NEG_3_2]], i32 [[SUB_3_2]] +; CHECK-NEXT: [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP40]], [[SPEC_SELECT8_2_2]] +; CHECK-NEXT: [[TMP41:%.*]] = or i1 [[CMP12_3_2]], [[TMP38]] +; CHECK-NEXT: [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP41]], i32 2, i32 [[SPEC_SELECT_3_1]] +; CHECK-NEXT: [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP40]], i32 [[SPEC_SELECT8_2_2]] ; CHECK-NEXT: [[SUB_328:%.*]] = sub i32 [[TMP15]], [[TMP9]] -; CHECK-NEXT: [[TMP41:%.*]] = icmp slt i32 [[SUB_328]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = icmp slt i32 [[SUB_328]], 0 ; CHECK-NEXT: [[NEG_329:%.*]] = sub nsw i32 0, [[SUB_328]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[NEG_329]], i32 [[SUB_328]] -; CHECK-NEXT: [[CMP12_330:%.*]] = icmp slt i32 [[TMP42]], [[SPEC_SELECT8_3_2]] -; CHECK-NEXT: [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP42]], i32 [[SPEC_SELECT8_3_2]] +; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[NEG_329]], i32 [[SUB_328]] +; CHECK-NEXT: [[CMP12_330:%.*]] = icmp slt i32 [[TMP43]], [[SPEC_SELECT8_3_2]] +; CHECK-NEXT: [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP43]], i32 [[SPEC_SELECT8_3_2]] ; CHECK-NEXT: [[SUB_1_3:%.*]] = sub i32 [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP43:%.*]] = icmp slt i32 [[SUB_1_3]], 0 +; CHECK-NEXT: [[TMP44:%.*]] = icmp slt i32 [[SUB_1_3]], 0 ; CHECK-NEXT: [[NEG_1_3:%.*]] = sub nsw i32 0, [[SUB_1_3]] -; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[NEG_1_3]], i32 [[SUB_1_3]] -; CHECK-NEXT: [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP44]], [[SPEC_SELECT8_332]] -; CHECK-NEXT: [[TMP45:%.*]] = or i1 [[CMP12_1_3]], [[CMP12_330]] -; CHECK-NEXT: [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP44]], i32 [[SPEC_SELECT8_332]] +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[NEG_1_3]], i32 [[SUB_1_3]] +; CHECK-NEXT: [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP45]], [[SPEC_SELECT8_332]] +; CHECK-NEXT: [[TMP46:%.*]] = or i1 [[CMP12_1_3]], [[CMP12_330]] +; CHECK-NEXT: [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP45]], i32 [[SPEC_SELECT8_332]] ; CHECK-NEXT: [[SUB_2_3:%.*]] = sub i32 [[TMP15]], [[TMP11]] -; CHECK-NEXT: [[TMP46:%.*]] = icmp slt i32 [[SUB_2_3]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = icmp slt i32 [[SUB_2_3]], 0 ; CHECK-NEXT: [[NEG_2_3:%.*]] = sub nsw i32 0, [[SUB_2_3]] -; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[NEG_2_3]], i32 [[SUB_2_3]] -; CHECK-NEXT: [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP47]], [[SPEC_SELECT8_1_3]] -; CHECK-NEXT: [[TMP48:%.*]] = or i1 [[CMP12_2_3]], [[TMP45]] -; CHECK-NEXT: [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP47]], i32 [[SPEC_SELECT8_1_3]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[NEG_2_3]], i32 [[SUB_2_3]] +; CHECK-NEXT: [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP48]], [[SPEC_SELECT8_1_3]] +; CHECK-NEXT: [[TMP49:%.*]] = or i1 [[CMP12_2_3]], [[TMP46]] +; CHECK-NEXT: [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP48]], i32 [[SPEC_SELECT8_1_3]] ; CHECK-NEXT: [[SUB_3_3:%.*]] = sub i32 [[TMP15]], [[TMP12]] -; CHECK-NEXT: [[TMP49:%.*]] = icmp slt i32 [[SUB_3_3]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = icmp slt i32 [[SUB_3_3]], 0 ; CHECK-NEXT: [[NEG_3_3:%.*]] = sub nsw i32 0, [[SUB_3_3]] -; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[NEG_3_3]], i32 [[SUB_3_3]] -; CHECK-NEXT: [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP50]], [[SPEC_SELECT8_2_3]] -; CHECK-NEXT: [[TMP51:%.*]] = or i1 [[CMP12_3_3]], [[TMP48]] -; CHECK-NEXT: [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP51]], i32 3, i32 [[SPEC_SELECT_3_2]] -; CHECK-NEXT: [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP50]], i32 [[SPEC_SELECT8_2_3]] -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i32> poison, i32 [[TMP15]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[NEG_3_3]], i32 [[SUB_3_3]] +; CHECK-NEXT: [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP51]], [[SPEC_SELECT8_2_3]] +; CHECK-NEXT: [[TMP52:%.*]] = or i1 [[CMP12_3_3]], [[TMP49]] +; CHECK-NEXT: [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP52]], i32 3, i32 [[SPEC_SELECT_3_2]] +; CHECK-NEXT: [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP51]], i32 [[SPEC_SELECT8_2_3]] ; CHECK-NEXT: [[TMP53:%.*]] = sub <16 x i32> [[SHUFFLE2]], [[TMP13]] ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <16 x i32> [[TMP53]], i32 0 ; CHECK-NEXT: [[NEG_4:%.*]] = sub nsw i32 0, [[TMP54]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -13,11 +13,8 @@ ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4 -; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, i32* [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 3 @@ -27,10 +24,6 @@ ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -38,19 +31,28 @@ ; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T47]], i32 3 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[T8]] to <2 x i32>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -13,11 +13,8 @@ ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4 -; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, i32* [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 3 @@ -27,10 +24,6 @@ ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -38,19 +31,28 @@ ; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T47]], i32 3 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[T8]] to <2 x i32>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll @@ -19,9 +19,9 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 ; CHECK-NEXT: [[I09:%.*]] = fmul fast double [[TMP4]], undef ; CHECK-NEXT: [[I10:%.*]] = fsub fast double undef, [[I09]] -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[I10]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[I10]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = fdiv fast <2 x double> [[TMP9]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -4,20 +4,32 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]] +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]] +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]] +; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARR]], align 4 +; CHECK-NEXT: [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]] +; CHECK-NEXT: [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]] +; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]] +; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]] +; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]] +; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]] +; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]] +; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]] +; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]] +; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]] +; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]] +; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]] +; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]] +; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]] +; CHECK-NEXT: ret i32 [[COND44]] ; entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll @@ -5,13 +5,12 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, double* [[ISEC:%.*]], i64 0 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <4 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -6,30 +6,28 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = uitofp i16 undef to float ; CHECK-NEXT: [[SUB:%.*]] = fsub float 6.553500e+04, undef -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[SUB]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float> -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]] +; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> poison, double [[SUB1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[ADD1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float> +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/reschedule.ll b/llvm/test/Transforms/SLPVectorizer/reschedule.ll --- a/llvm/test/Transforms/SLPVectorizer/reschedule.ll +++ b/llvm/test/Transforms/SLPVectorizer/reschedule.ll @@ -15,13 +15,13 @@ ; CHECK-NEXT: [[I5:%.*]] = load double, double* null, align 8 ; CHECK-NEXT: [[I6:%.*]] = load double, double* null, align 8 ; CHECK-NEXT: [[MUL746:%.*]] = fmul double 0.000000e+00, [[I6]] -; CHECK-NEXT: [[MUL747:%.*]] = fmul double 0.000000e+00, [[I5]] ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[MUL746]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[MUL701]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, <2 x double> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[MUL747]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[MUL703]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> zeroinitializer, <2 x double> [[TMP4]]) +; CHECK-NEXT: [[MUL747:%.*]] = fmul double 0.000000e+00, [[I5]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[MUL747]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[MUL703]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, <2 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> zeroinitializer, <2 x double> zeroinitializer) ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x double> zeroinitializer) ; CHECK-NEXT: br label [[FOR_COND794_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -133,26 +133,26 @@ ; MAX256-NEXT: br label [[BB1:%.*]] ; MAX256: bb1: ; MAX256-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float +; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 +; MAX256-NEXT: [[SHUFFLE6:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX256-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float +; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 +; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float +; MAX256-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 +; MAX256-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 -; MAX256-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 -; MAX256-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] -; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX256-NEXT: [[SHUFFLE4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 +; MAX256-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE6]], [[SHUFFLE7]] ; MAX256-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] -; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX256-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP7:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE7]] +; MAX256-NEXT: [[TMP8:%.*]] = fadd <8 x float> zeroinitializer, [[TMP7]] +; MAX256-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[SHUFFLE2]], [[SHUFFLE7]] +; MAX256-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP9]] +; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE4]], [[SHUFFLE7]] ; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] ; MAX256-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX256-NEXT: i32 0, label [[BB2:%.*]] @@ -166,10 +166,10 @@ ; MAX256: bb5: ; MAX256-NEXT: br label [[BB2]] ; MAX256: bb2: -; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] -; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] -; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP8]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] +; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ] +; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] +; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[TMP6]], [[BB4]] ], [ [[TMP6]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] ; MAX256-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 ; MAX256-NEXT: store float [[TMP17]], float* undef, align 4 ; MAX256-NEXT: ret void @@ -179,26 +179,26 @@ ; MAX1024-NEXT: br label [[BB1:%.*]] ; MAX1024: bb1: ; MAX1024-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float +; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 +; MAX1024-NEXT: [[SHUFFLE6:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float +; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 +; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float +; MAX1024-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 +; MAX1024-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 -; MAX1024-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 -; MAX1024-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] -; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] +; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX1024-NEXT: [[SHUFFLE4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 +; MAX1024-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE6]], [[SHUFFLE7]] ; MAX1024-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] -; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX1024-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] +; MAX1024-NEXT: [[TMP7:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE7]] +; MAX1024-NEXT: [[TMP8:%.*]] = fadd <8 x float> zeroinitializer, [[TMP7]] +; MAX1024-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[SHUFFLE2]], [[SHUFFLE7]] +; MAX1024-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP9]] +; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE4]], [[SHUFFLE7]] ; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] ; MAX1024-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX1024-NEXT: i32 0, label [[BB2:%.*]] @@ -212,10 +212,10 @@ ; MAX1024: bb5: ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb2: -; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] -; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP8]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] +; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ] +; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE7]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[TMP6]], [[BB4]] ], [ [[TMP6]], [[BB5]] ], [ [[SHUFFLE7]], [[BB1]] ] ; MAX1024-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 ; MAX1024-NEXT: store float [[TMP17]], float* undef, align 4 ; MAX1024-NEXT: ret void