diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -661,7 +661,8 @@ const unsigned E = Indices.size(); Mask.resize(E, UndefMaskElem); for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; + if (Indices[I] != E) + Mask[Indices[I]] = I; } /// \returns inserting index of InsertElement or InsertValue instruction, @@ -1925,8 +1926,8 @@ /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather - /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + /// intrinsics for store/load or split entry block)? + enum EntryState { Vectorize, ScatterVectorize, NeedToGather, SplitShuffle }; EntryState State; /// Does this sequence require some shuffling? @@ -2099,6 +2100,9 @@ case NeedToGather: dbgs() << "NeedToGather\n"; break; + case SplitShuffle: + dbgs() << "SplitShuffle\n"; + break; } dbgs() << "MainOp: "; if (MainOp) @@ -2167,7 +2171,8 @@ const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { - assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || + assert(((!Bundle && (EntryState == TreeEntry::NeedToGather || + EntryState == TreeEntry::SplitShuffle)) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); VectorizableTree.push_back(std::make_unique(VectorizableTree)); @@ -2192,7 +2197,8 @@ Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } - if (Last->State != TreeEntry::NeedToGather) { + if (Last->State == TreeEntry::Vectorize || + Last->State == TreeEntry::ScatterVectorize) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); ScalarToTreeEntry[V] = Last; @@ -2207,7 +2213,7 @@ } assert((!Bundle.getValue() || Lane == VL.size()) && "Bundle and VL out of sync"); - } else { + } else if (Last->State == TreeEntry::NeedToGather) { MustGather.insert(VL.begin(), VL.end()); } @@ -3134,19 +3140,48 @@ [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == - TE->Scalars.size(); + TE->Scalars.size() || + (EI.UserTE->State == TreeEntry::SplitShuffle && + EI.UserTE->Scalars.size() == 2 * VF); }) && "All users must be of VF size."); // Update ordering of the operands with the smaller VF than the given // one. reorderReuses(TE->ReuseShuffleIndices, Mask); + continue; + } + if (TE->State == TreeEntry::SplitShuffle && + TE->Scalars.size() == VF * 2) { + // Build a full mask out of smaller mask by just duplicating it. + assert(!TE->ReorderIndices.empty() && + "Expected reordered indeces in split shuffle node."); + TE->reorderOperands(Mask); + unsigned Sz = 2 * VF; + SmallVector SplitMask(Sz); + copy(Mask, SplitMask.begin()); + transform(Mask, std::next(SplitMask.begin(), VF), + [VF](int Idx) { return Idx + VF; }); + reorderScalars(TE->Scalars, SplitMask); + SmallVector PrevOrder(Sz, Sz); + TE->ReorderIndices.swap(PrevOrder); + for (unsigned I = 0; I < Sz; ++I) { + if (SplitMask[I] == UndefMaskElem) + continue; + TE->ReorderIndices[SplitMask[I]] = PrevOrder[I]; + } } continue; } - if (TE->State == TreeEntry::Vectorize && - isa(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (TE->State == TreeEntry::SplitShuffle) { + reorderOrder(TE->ReorderIndices, Mask); + if (TE->ReorderIndices.empty()) { + TE->ReorderIndices.assign(Mask.size(), 0); + std::iota(TE->ReorderIndices.begin(), TE->ReorderIndices.end(), 0); + } + } else if (TE->State == TreeEntry::Vectorize && + isa(TE->getMainOp()) && + !TE->isAltShuffle()) { // Build correct orders for extract{element,value}, loads and // stores. reorderOrder(TE->ReorderIndices, Mask); @@ -3366,25 +3401,37 @@ } // For gathers just need to reorder its scalars. for (TreeEntry *Gather : GatherOps) { - assert(Gather->ReorderIndices.empty() && + assert((Gather->State == TreeEntry::SplitShuffle || + Gather->ReorderIndices.empty()) && "Unexpected reordering of gathers."); if (!Gather->ReuseShuffleIndices.empty()) { // Just reorder reuses indices. reorderReuses(Gather->ReuseShuffleIndices, Mask); continue; } - reorderScalars(Gather->Scalars, Mask); + if (Gather->State == TreeEntry::SplitShuffle) { + reorderOrder(Gather->ReorderIndices, Mask); + if (Gather->ReorderIndices.empty()) { + Gather->ReorderIndices.assign(Mask.size(), 0); + std::iota(Gather->ReorderIndices.begin(), + Gather->ReorderIndices.end(), 0); + } + } else { + reorderScalars(Gather->Scalars, Mask); + } OrderedEntries.remove(Gather); } // Reorder operands of the user node and set the ordering for the user // node itself. - if (Data.first->State != TreeEntry::Vectorize || - !isa( - Data.first->getMainOp()) || - Data.first->isAltShuffle()) + if (Data.first->State != TreeEntry::SplitShuffle && + (Data.first->State != TreeEntry::Vectorize || + !isa( + Data.first->getMainOp()) || + Data.first->isAltShuffle())) Data.first->reorderOperands(Mask); - if (!isa(Data.first->getMainOp()) || - Data.first->isAltShuffle()) { + if (Data.first->State != TreeEntry::SplitShuffle && + (!isa(Data.first->getMainOp()) || + Data.first->isAltShuffle())) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder); if (Data.first->ReuseShuffleIndices.empty() && @@ -3394,6 +3441,27 @@ // the graph. OrderedEntries.insert(Data.first); } + } else if (Data.first->State == TreeEntry::SplitShuffle) { + // Build a full mask out of smaller mask by just duplicating it. + unsigned VF = Mask.size(); + unsigned Sz = 2 * VF; + assert(Data.first->Scalars.size() == Sz && + "Scalars size must be twice of operands size."); + assert(!Data.first->ReorderIndices.empty() && + "Expected reordered indeces in spit shuffle node."); + Data.first->reorderOperands(Mask); + SmallVector SplitMask(Sz); + copy(Mask, SplitMask.begin()); + transform(Mask, std::next(SplitMask.begin(), VF), + [VF](int Idx) { return Idx + VF; }); + reorderScalars(Data.first->Scalars, SplitMask); + SmallVector PrevOrder(Sz, Sz); + Data.first->ReorderIndices.swap(PrevOrder); + for (unsigned I = 0; I < Sz; ++I) { + if (SplitMask[I] == UndefMaskElem) + continue; + Data.first->ReorderIndices[SplitMask[I]] = PrevOrder[I]; + } } else { reorderOrder(Data.first->ReorderIndices, Mask); } @@ -3412,7 +3480,8 @@ TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->State == TreeEntry::NeedToGather) + if (Entry->State == TreeEntry::NeedToGather || + Entry->State == TreeEntry::SplitShuffle) continue; // For each lane: @@ -3539,6 +3608,78 @@ return LoadsState::Gather; } +/// Generates key/subkey pair for the given value to provide effective sorting +/// of the values and better detection of the vectorizable values sequences. The +/// keys/subkeys can be used for better sorting of the values themselves (keys) +/// and in values subgroups (subkeys). +static std::pair generateKeySubkey( + Value *V, const TargetLibraryInfo *TLI, + function_ref LoadsSubkeyGenerator) { + hash_code Key = hash_value(V->getValueID() + 1); + hash_code SubKey = hash_value(0); + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast(V)) { + Key = hash_combine(hash_value(LI->getParent()), Key); + if (LI->isSimple()) + SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); + else + SubKey = hash_value(LI); + } else if (isVectorLikeInstWithConstOps(V)) { + // Sort extracts by the vector operands. + if (isa(V)) + Key = hash_value(Value::UndefValueVal + 1); + if (auto *EI = dyn_cast(V)) { + if (!isUndefVector(EI->getVectorOperand()) && + !isa(EI->getIndexOperand())) + SubKey = hash_value(EI->getVectorOperand()); + } + } else if (auto *I = dyn_cast(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind. + if ((isa(I) || isa(I)) && + isValidForAlternation(I->getOpcode())) { + Key = hash_value(0); + SubKey = hash_combine( + hash_value(isa(I) ? 1 : 0), + hash_value(isa(I) + ? I->getType() + : cast(I)->getOperand(0)->getType())); + } else if (auto *CI = dyn_cast(I)) { + CmpInst::Predicate Pred = CI->getPredicate(); + CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Pred > SwapPred ? Pred : SwapPred), + hash_value(Pred > SwapPred ? SwapPred : Pred), + hash_value(CI->getOperand(0)->getType())); + } else if (auto *Call = dyn_cast(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); + if (isTriviallyVectorizable(ID)) + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); + else if (!VFDatabase(*Call).getMappings(*Call).empty()) + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Call->getCalledFunction())); + else + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); + for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) + SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), + hash_value(Op.Tag), SubKey); + } else if (auto *Gep = dyn_cast(I)) { + if (Gep->getNumOperands() == 2 && isa(Gep->getOperand(1))) + SubKey = hash_value(Gep->getPointerOperand()); + else + SubKey = hash_value(Gep); + } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && + !isa(I->getOperand(1))) { + // Do not try to vectorize instructions with potentially high cost. + SubKey = hash_value(I); + } else { + SubKey = hash_value(I->getOpcode()); + } + Key = hash_combine(hash_value(I->getParent()), Key); + } + return std::make_pair(Key, SubKey); +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -3621,9 +3762,148 @@ // If all of the operands are identical or constant we have a simple solution. // If we deal with insert/extract instructions, they all must have constant // indices, otherwise we should gather them, not try to vectorize. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() || + bool IsSplat = isSplat(VL); + bool AllConstant = !IsSplat && allConstant(VL); + bool AllSameBlock = !AllConstant && allSameBlock(VL); + if (IsSplat || AllConstant || !AllSameBlock || !S.getOpcode() || (isa(S.MainOp) && !all_of(VL, isVectorLikeInstWithConstOps))) { + // Try to build split shuffle if possible. + SmallPtrSet UniqueVals; + unsigned VF = VL.size() / 2; + if (!IsSplat && !AllConstant && (!AllSameBlock || !S.getOpcode()) && + VL.size() > 2 && count_if(VL, [&UniqueVals](Value *V) { + return UniqueVals.insert(V).second; + }) > 2) { + MapVector>> + SelectedValues; + SmallVector Vectorized; + unsigned InstCnt = 0; + std::pair BestKeySubkey; + SmallDenseMap, 4> BestLoad; + // Group values by the most optimal kinds/subkinds. + for (unsigned Idx = 0, E = VL.size(); Idx < E; ++Idx) { + Value *V = VL[Idx]; + if (ScalarToTreeEntry.count(V) || MustGather.contains(V)) { + Vectorized.push_back(Idx); + continue; + } + std::pair KeySubkey = generateKeySubkey( + V, TLI, + [&SelectedValues, &BestLoad, VL, DL = DL, SE = SE, + VF, Idx](size_t Key, LoadInst *LI) { + for (const auto &LoadData : SelectedValues[Key]) { + auto &Data = BestLoad[LoadData.second.front()]; + auto *RLI = cast(VL[Data.first]); + Optional Dist = getPointersDiff( + RLI->getType(), RLI->getPointerOperand(), LI->getType(), + LI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); + if (Dist && static_cast(std::abs(*Dist)) < VF) { + if (Data.second > *Dist) { + Data.first = Idx; + Data.second = *Dist; + } + return hash_value(cast(VL[LoadData.second.front()]) + ->getPointerOperand()); + } + } + BestLoad.try_emplace(Idx, std::make_pair(Idx, 0)); + return hash_value(LI->getPointerOperand()); + }); + SmallVector &Items = + SelectedValues[KeySubkey.first][KeySubkey.second]; + Items.push_back(Idx); + if (Items.size() > InstCnt && any_of(Items, [VL](unsigned Idx) { + if (auto *EI = dyn_cast(VL[Idx])) { + if (auto *FTy = + dyn_cast(EI->getVectorOperandType())) + return FTy->getNumElements() < VL.size(); + return false; + } + return isa(VL[Idx]); + })) { + InstCnt = Items.size(); + BestKeySubkey = KeySubkey; + } + } + // Check number of unique elements. + UniqueVals.clear(); + unsigned UniqueValsCnt = count_if( + SelectedValues[BestKeySubkey.first][BestKeySubkey.second], + [&UniqueVals, VL](unsigned Idx) { + return isa(VL[Idx]) || + UniqueVals.insert(VL[Idx]).second; + }); + // Consider it only if we can split it evenly. + if ((((UniqueVals.empty() || !isa(*UniqueVals.begin())) && + InstCnt >= VF && InstCnt < VL.size()) || + InstCnt == VF) && + (UniqueValsCnt != 2 || (InstCnt == VF && UniqueValsCnt == VF))) { + LLVM_DEBUG(dbgs() << "SLP: build split shuffle block. \n"); + // How many time shall we repeat same value in the Main node. + unsigned NumRepeats = VL.size() / UniqueValsCnt; + DenseMap UniqueValsCounter; + auto SelectedValuesVector = SelectedValues.takeVector(); + SmallVector ReorderIndices(VL.size(), VL.size()); + SmallVector Operands(2); + unsigned MainCnt = 0; + unsigned SecondCnt = 0; + for (unsigned I = 0, E = SelectedValuesVector.size(); I < E; ++I) { + auto ValuesVector = SelectedValuesVector[I].second.takeVector(); + for (unsigned K = 0, N = ValuesVector.size(); K < N; ++K) { + unsigned Sz = ValuesVector[K].second.size(); + SmallVector Undefs; + for (unsigned Idx : ValuesVector[K].second) { + Value *V = VL[Idx]; + unsigned &RepeatCntRef = + UniqueValsCounter.try_emplace(V, 0).first->getSecond(); + if (Sz >= VF && MainCnt < VF && + (isa(V) || RepeatCntRef < NumRepeats)) { + ++RepeatCntRef; + if (isa(V)) { + Undefs.push_back(Idx); + } else { + ReorderIndices[MainCnt] = Idx; + Operands.front().push_back(V); + ++MainCnt; + } + } else { + ReorderIndices[SecondCnt + VF] = Idx; + ++SecondCnt; + Operands.back().push_back(V); + } + } + // Process remaining undefs. + if (MainCnt < VF) { + for (unsigned Idx : Undefs) { + Value *V = VL[Idx]; + if (Sz >= VF && MainCnt < VF) { + Operands.front().push_back(V); + ++MainCnt; + } else { + ++SecondCnt; + Operands.back().push_back(V); + } + } + } + } + } + for (unsigned Idx : Vectorized) { + Value *V = VL[Idx]; + ReorderIndices[SecondCnt + VF] = Idx; + ++SecondCnt; + Operands.back().push_back(V); + } + TreeEntry *TE = newTreeEntry(VL, TreeEntry::SplitShuffle, None, S, + UserTreeIdx, None, ReorderIndices); + + for (unsigned I = 0, E = Operands.size(); I < E; ++I) + TE->setOperand(I, Operands[I]); + for (unsigned I = 0, E = Operands.size(); I < E; ++I) + buildTree_rec(Operands[I], Depth + 1, {TE, I}); + return; + } + } LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4862,6 +5142,24 @@ } return ReuseShuffleCost + getGatherCost(VL); } + if (E->State == TreeEntry::SplitShuffle) { + unsigned VF = VL.size(); + SmallVector Mask(VF, UndefMaskElem); + for (unsigned I = 0; I < VF; ++I) { + Value *V = VL[I]; + unsigned OpIdx = 0; + const auto *It = find(E->getOperand(OpIdx), V); + if (It == E->getOperand(OpIdx).end()) { + OpIdx = 1; + It = find(E->getOperand(OpIdx), V); + assert(It != E->getOperand(OpIdx).end() && + "Subvectors are not synced."); + } + int Idx = std::distance(E->getOperand(OpIdx).begin(), It); + Mask[I] = Idx + VF * OpIdx; + } + return TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask); + } InstructionCost CommonCost = 0; SmallVector Mask; if (!E->ReorderIndices.empty()) { @@ -6246,6 +6544,12 @@ return V; } } + auto *I = + find_if(VectorizableTree, [VL](const std::unique_ptr &TE) { + return TE->State == TreeEntry::SplitShuffle && TE->isSame(VL); + }); + if (I != VectorizableTree.end()) + return vectorizeTree(I->get()); // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; @@ -6309,6 +6613,42 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); + auto &&SetInsertPointAfterOps = [this](ArrayRef VL) { + // The last instruction in the bundle in program order. + Instruction *LastInst = nullptr; + + for (Value *V : VL) { + // If the value was vectorized, need to get the vector value for correct + // insert point. + if (const TreeEntry *TE = getTreeEntry(V)) + if (TE->VectorizedValue) + V = TE->VectorizedValue; + auto *I = dyn_cast(V); + if (!I) + continue; + if (!DT->isReachableFromEntry(I->getParent())) + continue; + if (!LastInst) { + LastInst = I; + continue; + } + if ((LastInst->getParent() != I->getParent() && + DT->dominates(LastInst->getParent(), I->getParent())) || + (LastInst->getParent() == I->getParent() && LastInst->comesBefore(I))) + LastInst = I; + } + // Set the insertion point after the last instruction in the bundle. Set + // the debug location to Front. + if (!LastInst) + return; + if (isa(LastInst)) + Builder.SetInsertPoint(LastInst->getParent(), + LastInst->getParent()->getFirstInsertionPt()); + else + Builder.SetInsertPoint(LastInst->getParent(), + std::next(LastInst->getIterator())); + Builder.SetCurrentDebugLocation(LastInst->getDebugLoc()); + }; ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, CSEBlocks); if (E->State == TreeEntry::NeedToGather) { @@ -6338,6 +6678,24 @@ E->VectorizedValue = Vec; return Vec; } + if (E->State == TreeEntry::SplitShuffle) { + SetInsertPointAfterOps(E->getOperand(0)); + Value *Op0 = vectorizeTree(E->getOperand(0)); + SetInsertPointAfterOps(E->getOperand(1)); + Value *Op1 = vectorizeTree(E->getOperand(1)); + // Fix the insert point to emit shuffles exactly after the last instruction + // in the operands. + SetInsertPointAfterOps({Op0, Op1}); + SmallVector Mask; + inversePermutation(E->ReorderIndices, Mask); + Value *Vec = Builder.CreateShuffleVector(Op0, Op1, Mask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + E->VectorizedValue = Vec; + return Vec; + } assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && @@ -7020,7 +7378,8 @@ TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->State == TreeEntry::NeedToGather) + if (Entry->State == TreeEntry::NeedToGather || + Entry->State == TreeEntry::SplitShuffle) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -10,19 +10,18 @@ ; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[ADD277]], i32 1 ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1 ; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> poison, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], ; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -165,8 +165,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -209,12 +209,9 @@ ; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP12]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP13]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -165,8 +165,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP4]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -209,12 +209,9 @@ ; CHECK-NEXT: [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R72]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP12]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP13]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,25 +4,13 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]] -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef -; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef -; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef -; CHECK-NEXT: ret i32 [[ADD11]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> poison) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP0]], undef +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], undef +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA2]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], undef +; CHECK-NEXT: ret i32 [[OP_EXTRA4]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll @@ -8,17 +8,16 @@ ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> zeroinitializer, <4 x i32> ; CHECK-NEXT: br label [[FOR_BODY92:%.*]] ; CHECK: for.body92: ; CHECK-NEXT: [[SUM_MVR_I:%.*]] = getelementptr i32, i32* undef, i32 0 ; CHECK-NEXT: [[SUM_MVR_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 2 ; CHECK-NEXT: [[SUM_MVC_I:%.*]] = getelementptr i32, i32* undef, i32 1 ; CHECK-NEXT: [[SUM_MVC_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[SUM_MVR_I]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[SUM_MVR_I]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 ; CHECK-NEXT: br label [[FOR_BODY92]] ; for.body92.preheader: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -13,21 +13,21 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> poison, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], poison +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[SHUFFLE]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> poison, [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], poison -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 -; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 -; CHECK-NEXT: [[VEC2:%.*]] = insertelement <2 x float> [[VEC1]], float [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 -; CHECK-NEXT: [[VEC3:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3 -; CHECK-NEXT: [[VEC4:%.*]] = insertelement <2 x float> [[VEC3]], float [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], poison +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <2 x float> undef, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <2 x float> [[VEC1]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <2 x float> [[VEC3]], float [[TMP14]], i32 1 ; CHECK-NEXT: [[INS1:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[VEC2]], 0 ; CHECK-NEXT: [[INS2:%.*]] = insertvalue { <2 x float>, <2 x float> } [[INS1]], <2 x float> [[VEC4]], 1 ; CHECK-NEXT: ret { <2 x float>, <2 x float> } [[INS2]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -195,12 +195,11 @@ ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[ARG1]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[ARG1]], i32 3 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[ARG1]], i32 6 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG2:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP8]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> , float [[ARG2:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> [[TMP0]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP8]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; CHECK: bb16: ; CHECK-NEXT: br label [[BB17:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll @@ -12,22 +12,23 @@ ; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1 ; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2 ; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3 -; CHECK-NEXT: [[V0:%.*]] = load i32, i32* [[GEP1_0]] -; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[GEP1_1]] -; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]] -; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]] -; CHECK-NEXT: [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146 -; CHECK-NEXT: [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[GEP1_0]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4 +; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], ; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42 ; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0 -; CHECK-NEXT: [[RES0:%.*]] = add nsw i32 [[V0]], [[Y0]] -; CHECK-NEXT: [[RES1:%.*]] = add nsw i32 [[V1]], [[Y1]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]] ; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]] -; CHECK-NEXT: store i32 [[RES0]], i32* [[GEP2_0]] -; CHECK-NEXT: store i32 [[RES1]], i32* [[GEP2_1]] -; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]] -; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[RES3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[RES2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[GEP2_0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -76,10 +77,10 @@ ; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1 ; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2 ; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3 -; CHECK-NEXT: [[V0:%.*]] = load i32, i32* [[GEP1_0]] -; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[GEP1_1]] -; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]] -; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]] +; CHECK-NEXT: [[V0:%.*]] = load i32, i32* [[GEP1_0]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[GEP1_1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4 +; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146 ; CHECK-NEXT: [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146 ; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42 @@ -88,10 +89,10 @@ ; CHECK-NEXT: [[RES1:%.*]] = urem i32 [[V1]], [[Y1]] ; CHECK-NEXT: [[RES2:%.*]] = urem i32 [[V2]], [[Y2]] ; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]] -; CHECK-NEXT: store i32 [[RES0]], i32* [[GEP2_0]] -; CHECK-NEXT: store i32 [[RES1]], i32* [[GEP2_1]] -; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]] -; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]] +; CHECK-NEXT: store i32 [[RES0]], i32* [[GEP2_0]], align 4 +; CHECK-NEXT: store i32 [[RES1]], i32* [[GEP2_1]], align 4 +; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]], align 4 +; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -151,37 +151,36 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] ; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP10]], i32 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP12]] = load <2 x float>, <2 x float>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], -; CHECK-NEXT: [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* +; CHECK-NEXT: [[TMP15]] = load <2 x float>, <2 x float>* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], +; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -254,65 +254,39 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0 +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP14:%.*]] = add <8 x i32> [[TMP13]], +; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 -; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 -; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4 -; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP14:%.*]] = add <8 x i32> [[TMP13]], +; AVX512VL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -457,65 +431,39 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 -; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1 +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0 +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> +; AVX512F-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> +; AVX512F-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], -; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX512VL-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 @@ -730,48 +678,46 @@ ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> ; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 ; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer ; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> ; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0 +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]] +; AVX512F-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> ; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 ; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> ; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]] +; AVX512VL-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -254,65 +254,39 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0 +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP14:%.*]] = add <8 x i32> [[TMP13]], +; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 -; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 -; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4 -; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <2 x i32*> [[TMP9]], i32* [[TMP8]], i64 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32*> [[TMP7]], <2 x i32*> [[TMP10]], <4 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP11]], <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP14:%.*]] = add <8 x i32> [[TMP13]], +; AVX512VL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -457,65 +431,39 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 -; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1 +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0 +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> +; AVX512F-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> +; AVX512F-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512F-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], -; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i64 1 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <2 x i32*> [[TMP6]], i32* [[T30]], i64 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP7]], <4 x i32> +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32*> [[TMP2]], <4 x i32*> [[TMP8]], <8 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX512VL-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 @@ -730,48 +678,46 @@ ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> ; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 ; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer ; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> ; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1 +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0 +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]] +; AVX512F-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE]], <4 x i64> ; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 ; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> ; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 -; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] -; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 1 +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <2 x float*> [[TMP9]], float* [[TMP8]], i64 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> [[TMP10]], <4 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> [[TMP11]], <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP12]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE1]], <8 x i64> +; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP17:%.*]] = fdiv <8 x float> [[TMP13]], [[TMP16]] +; AVX512VL-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; ; AVX512-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -6,15 +6,13 @@ define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i64 0 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 -; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[VECINIT51]] +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[F]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[ADD4]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP5]] ; %vecinit = insertelement <4 x i32> undef, i32 %f, i32 0 %add = add nsw i32 %f, 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -213,20 +213,35 @@ ; logic...or a wide reduction? define i1 @logical_and_icmp_clamp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; SSE-LABEL: @logical_and_icmp_clamp( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]] +; SSE-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 +; SSE-NEXT: [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; SSE-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 +; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false +; SSE-NEXT: ret i1 [[S7]] +; +; AVX-LABEL: @logical_and_icmp_clamp( +; AVX-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> , <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP3]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP2]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP1]], i32 3 +; AVX-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[TMP5]], [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = freeze <8 x i1> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) +; AVX-NEXT: ret i1 [[TMP12]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -11,11 +11,12 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[T2:%.*]], align 4 ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7 -; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1 ; CHECK-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> , i32 [[T9]], i32 0 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6 -; CHECK-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[T10]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2 ; CHECK-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5 @@ -31,32 +32,33 @@ ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] -; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] -; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T25]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[T17]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[TMP7]], [[TMP8]] ; CHECK-NEXT: [[T40:%.*]] = mul nsw i32 [[T39]], 9633 ; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 -; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 -; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5 +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <2 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[T40]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T27]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[T40]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[T15]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3 +; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T65]], <8 x i32> [[TMP19]], <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0 +; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[TMP20]], i32 5 ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP21]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -11,11 +11,12 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[T2:%.*]], align 4 ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7 -; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1 ; CHECK-NEXT: [[T9:%.*]] = load i32, i32* [[T8]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> , i32 [[T9]], i32 0 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6 -; CHECK-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[T10]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2 ; CHECK-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5 @@ -31,32 +32,33 @@ ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] -; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] -; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T25]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[T17]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[TMP7]], [[TMP8]] ; CHECK-NEXT: [[T40:%.*]] = mul nsw i32 [[T39]], 9633 ; CHECK-NEXT: [[T41:%.*]] = mul nsw i32 [[T25]], 2446 ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 -; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 -; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5 +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <2 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[T40]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T27]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[T40]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[T15]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3 +; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T65]], <8 x i32> [[TMP19]], <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0 +; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[TMP20]], i32 5 ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP21]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -45,20 +45,20 @@ ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[ADD9]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -336,20 +336,20 @@ ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[SHL8]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -455,20 +455,20 @@ ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP4]], 3.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[ADD9]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +701,22 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP4]], -9.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -784,20 +784,20 @@ ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP4]], 3.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[ADD9]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -944,22 +944,22 @@ ; CHECK-LABEL: @mulfn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP4]], -9.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP18:%.*]], [[BB3:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: @@ -23,17 +23,13 @@ ; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float> -; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP18]] = phi <4 x float> [ [[TMP17]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: