Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -92,6 +92,7 @@ #include #include #include +#include #include #include #include @@ -124,6 +125,11 @@ cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt + SLPThrottleBudget("slp-throttling-budget", cl::init(32), cl::Hidden, + cl::desc("Limit the total number of nodes for cost " + "recalculations during throttling")); + static cl::opt MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); @@ -626,11 +632,62 @@ /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - InstructionCost getSpillCost() const; + InstructionCost getSpillCost(); + + /// \returns the cost of extracting vectorized elements. + InstructionCost getExtractCost() const; + + /// \returns the cost of gathering canceled elements to be used + /// by vectorized operations during throttling. + InstructionCost getInsertCost(); + + struct TECostComparator { + bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const { + return LHS->Cost > RHS->Cost; + } + }; + using TEVecQueue = std::priority_queue, + TECostComparator>; + + /// Find a subtree of the whole tree suitable to be vectorized. When + /// vectorizing the whole tree is not profitable, we can consider vectorizing + /// part of that tree. The SLP algorithm looks for operations to vectorize + /// starting from seed instructions at the bottom and follows the chains of + /// dependencies to the top of the SLP graph, grouping potentially + /// vectorizable scalar operations into bundles. + /// For example: + /// + /// vector form + /// | + /// vector form vector form + /// \ / + /// vector form + /// + /// The total cost is not profitable to vectorize, hence all operations are in + /// scalar form. + /// + /// Here is the same tree after SLP throttling transformation: + /// + /// vector form + /// | + /// vector form gathered nodes + /// \ / + /// vector form + /// + /// So, we can throttle some operations in such a way that it is still + /// profitable to vectorize part of the tree, while vectorizing the whole tree + /// does not make sense. + /// More details: + /// https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf + bool findSubTree(std::vector &Vec, unsigned &RealOpNodes, + InstructionCost TreeCost); + + /// Get the raw cost sum of all elements of the tree. + InstructionCost getRawTreeCost(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(); + InstructionCost getTreeCost(bool TreeReduce = false); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -651,6 +708,8 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + InternalTreeUses.clear(); + ProposedToGather.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { @@ -659,6 +718,9 @@ } MinBWs.clear(); InstrElementSize.clear(); + NoCallInst = true; + RawTreeCost = 0; + IsCostSumReady = false; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -821,6 +883,9 @@ /// may not be necessary.
bool isLoadCombineCandidate() const; + /// Cut the tree to make it partially vectorizable. + void cutTree(); + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1637,6 +1702,9 @@ /// Does this entry require reordering? SmallVector ReorderIndices; + /// Cost of this tree entry. + InstructionCost Cost = 0; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -1649,6 +1717,9 @@ /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + /// The tree entries that this entry uses (its operand entries). + TinyPtrVector UseEntries; + /// The index of this treeEntry in VectorizableTree. int Idx = -1; @@ -1718,6 +1789,21 @@ getAltOpcode() == CheckedOpcode); } + bool isRealOp(unsigned Opcode) const { + switch (Opcode) { + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + case Instruction::ExtractElement: + case Instruction::InsertElement: + case Instruction::Load: + case Instruction::PHI: + case Instruction::Store: + return false; + default: + return true; + } + } + /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is /// \p OpValue. @@ -1750,6 +1836,18 @@ return AltOp ? AltOp->getOpcode() : 0; } + /// The entry contains real computational operations (binary, arithmetic, + /// calls, etc.), not just data movement operations like load, store and others. + bool isRealOpEntry() const { + unsigned Opcode = getOpcode(); + unsigned AltOpcode = getAltOpcode(); + if (Opcode && isRealOp(Opcode)) + return true; + if (AltOpcode && isRealOp(AltOpcode)) + return true; + return false; + } + /// Update operations state of this entry if reorder occurred. bool updateStateIfReorder() { if (ReorderIndices.empty()) @@ -1881,8 +1979,10 @@ MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) + if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); + VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last); + } return Last; } @@ -1932,6 +2032,9 @@ }; using UserList = SmallVector; + /// \returns the cost of extracting the vectorized elements. + InstructionCost getExtractOperationCost(const ExternalUser &EU) const; + /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it @@ -1982,6 +2085,25 @@ /// after vectorization. UserList ExternalUses; + /// Tree entries that should not be vectorized due to throttling. + SmallPtrSet ProposedToGather; + + /// Raw cost of all elements in the tree. + InstructionCost RawTreeCost = 0; + + /// Indicates that no CallInst was found in the tree and we don't need to + /// calculate the spill cost. + bool NoCallInst = true; + + /// True if we have already calculated the raw cost sum for the tree. + bool IsCostSumReady = false; + + /// Current width of the operations being vectorized. + unsigned BundleWidth = 0; + + /// Uses of tree values by internal operations proposed to be vectorized. + SmallDenseMap InternalTreeUses; + /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; @@ -2325,6 +2447,9 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// Make the scheduling region smaller. + void reduceSchedulingRegion(Instruction *Start, Instruction *End); + BasicBlock *BB; /// Simple memory allocation for ScheduleData. @@ -2387,6 +2512,9 @@ /// performed in a basic block.
void scheduleBlock(BlockScheduling *BS); + /// Remove operations from the list of operations proposed to be scheduled. + void removeFromScheduling(BlockScheduling *BS); + /// List of users to ignore during scheduling and that don't need extracting. ArrayRef UserIgnoreList; @@ -2601,7 +2729,7 @@ buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -2634,6 +2762,7 @@ // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. + InternalTreeUses[U].emplace_back(Scalar, U, FoundLane); if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { @@ -3396,6 +3525,50 @@ } } +void BoUpSLP::cutTree() { + SmallVector VecNodes; + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) + continue; + // For all canceled operations we should consider the possibility of + // their use by non-canceled operations and, for that, we need + // to populate the ExternalUses list with the canceled elements. + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + TreeEntry *UserTE = getTreeEntry(U); + if (!UserTE || ProposedToGather.count(UserTE) == 0) + continue; + // Ignore users in the user ignore list. + auto *UserInst = dyn_cast(U); + if (!UserInst) + continue; + + if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U + << " from lane " << Lane << " from " << *Scalar + << ".\n"); + ExternalUses.emplace_back(Scalar, U, Lane); + } + } + } + // Cancel the unprofitable elements. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + ScalarToTreeEntry.erase(V); +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); +#endif + } + } +} + unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; @@ -3707,7 +3880,7 @@ SmallVector Entries; Optional Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle.hasValue() && ProposedToGather.count(E) == 0) { if (ShuffleVectorInst::isIdentityMask(Mask)) { LLVM_DEBUG( dbgs() @@ -4246,12 +4419,11 @@ return true; } -InstructionCost BoUpSLP::getSpillCost() const { +InstructionCost BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required).
- unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); InstructionCost Cost = 0; SmallPtrSet LiveValues; @@ -4316,6 +4488,7 @@ } if (NumCalls) { + NoCallInst = false; SmallVector V; for (auto *II : LiveValues) { auto *ScalarTy = II->getType(); @@ -4332,76 +4505,229 @@ return Cost; } -InstructionCost BoUpSLP::getTreeCost() { - InstructionCost Cost = 0; - LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " - << VectorizableTree.size() << ".\n"); - - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); +InstructionCost BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + // No extract cost for vector "scalar" + if (isa(EU.Scalar->getType())) + return 0; - InstructionCost C = getEntryCost(&TE); - Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - } + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); + Value *ScalarRoot = VectorizableTree.front()->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt; + VecTy = FixedVectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} - SmallPtrSet ExtractCostCalculated; +InstructionCost BoUpSLP::getExtractCost() const { InstructionCost ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { + SmallPtrSet ExtractCostCalculated; + // Consider the possibility of extracting vectorized + // values for canceled elements use. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. + auto It = InternalTreeUses.find(V); + if (It != InternalTreeUses.end()) { + const UserList &UL = It->second; + for (const ExternalUser &IU : UL) + ExtractCost += getExtractOperationCost(IU); + } + } + } + for (const ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!ExtractCostCalculated.insert(EU.Scalar).second) continue; - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). 
- if (EphValues.count(EU.User)) + ExtractCost += getExtractOperationCost(EU); + } + return ExtractCost; +} + +InstructionCost BoUpSLP::getInsertCost() { + InstructionCost InsertCost = 0; + for (TreeEntry *Entry : ProposedToGather) { + // Avoid already vectorized TreeEntries: they are already in vector form and + // we don't need to gather those operations, nor nodes that were once + // considered to be vectorized but now don't have any direct relation + // to vectorizable nodes. + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + if (llvm::any_of(Inst->users(), [this](User *Op) { + if (const TreeEntry *UserTE = getTreeEntry(Op)) { + return (ProposedToGather.count(UserTE) != 0); + } + return false; + })) { + InsertCost += getEntryCost(Entry); + break; + } + } + } + return InsertCost; +} + +bool BoUpSLP::findSubTree(std::vector &Vec, unsigned &RealOpNodes, + InstructionCost TreeCost) { + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + // Ignore any non-vectorizable entries, entries with low cost, + // or the root entry. + if (Entry->State == TreeEntry::NeedToGather) continue; - // No extract cost for vector "scalar" + if (Entry->isRealOpEntry()) + RealOpNodes++; + + if (Entry->Cost <= 0 || !Entry->Idx) continue; + Vec.push_back(Entry); + } + InstructionCost Sum = 0; + for (TreeEntry *Entry : Vec) + Sum += Entry->Cost; + // Avoid reducing the tree if there is no potential room to reduce. + if ((TreeCost - Sum) >= -SLPCostThreshold) + return false; - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ?
Instruction::SExt : Instruction::ZExt; - VecTy = FixedVectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } + return (Vec.size() > 0); +} + +InstructionCost BoUpSLP::getRawTreeCost() { + InstructionCost CostSum = 0; + BundleWidth = VectorizableTree.front()->Scalars.size(); + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << VectorizableTree.size() << ".\n"); + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); + + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost + << " for bundle that starts with " << *TE.Scalars[0] + << ".\n"); + CostSum += TE.Cost; + LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n"); + } + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::ScatterVectorize) + continue; + InstructionCost GatherCost = 0; + for (TreeEntry *Gather : TE->UseEntries) + if (Gather->State != TreeEntry::Vectorize && + Gather->State != TreeEntry::ScatterVectorize) + GatherCost += Gather->Cost; + TE->Cost += GatherCost; } + return CostSum; +} - InstructionCost SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; +InstructionCost BoUpSLP::getTreeCost(bool TreeReduce) { + InstructionCost CostSum; + if (!IsCostSumReady) { + CostSum = getRawTreeCost(); + RawTreeCost = CostSum; + } else { + CostSum = RawTreeCost; + } + + InstructionCost ExtractCost = getExtractCost(); + InstructionCost SpillCost = 0; + if (!NoCallInst || !IsCostSumReady) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + if (!IsCostSumReady) + IsCostSumReady = true; + InstructionCost InsertCost = getInsertCost(); + InstructionCost Cost = CostSum + ExtractCost + SpillCost + InsertCost; + InstructionCost FullCost = Cost; #ifndef NDEBUG SmallString<256> Str; - { - raw_svector_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; - } + raw_svector_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Insert Cost = " << InsertCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); #endif + if (TreeReduce && Cost >= -SLPCostThreshold) { + std::vector Vec; + unsigned RealOpNodes = 0; + if (!findSubTree(Vec, RealOpNodes, Cost)) + return Cost; + TEVecQueue Queue(Vec.begin(), Vec.end()); + unsigned NodeCounter = 0; + bool HasRealOps = (RealOpNodes > 0); - return Cost; + while (!Queue.empty()) { + TreeEntry *T = Queue.top(); + Queue.pop(); + NodeCounter++; + + if (!NoCallInst && NodeCounter > SLPThrottleBudget) + break; + + ProposedToGather.insert(T); + T->State = TreeEntry::NeedToGather; + // If an original tree contained some real operations like binary, + // arithmetical, calls which were proposed to vectorize then we don't + // want to reduce this tree to just load and store operations in + // vectorized form. 
+ if (HasRealOps) { + if (T->isRealOpEntry()) + RealOpNodes--; + if (RealOpNodes == 0) + break; + } + for (Value *V : T->Scalars) { + MustGather.insert(V); + ExternalUses.erase( + llvm::remove_if(ExternalUses, + [V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + CostSum -= T->Cost; + ExtractCost = getExtractCost(); + if (!NoCallInst) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + InsertCost = getInsertCost(); + Cost = CostSum + ExtractCost + SpillCost + InsertCost; + if (Cost < -SLPCostThreshold && !isTreeTinyAndNotFullyVectorizable() && + (HasRealOps || + (VectorizableTree[0]->State == TreeEntry::Vectorize && + VectorizableTree[1]->State == TreeEntry::Vectorize))) { + cutTree(); + return Cost; + } + } + ProposedToGather.clear(); + } + return FullCost; } Optional @@ -5280,12 +5606,25 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all ScheduleData from the nodes for which we have changed the + // vectorization decision. + if (!ProposedToGather.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if ((Entry->State == TreeEntry::Vectorize || + Entry->State == TreeEntry::ScatterVectorize) && + !Entry->VectorizedValue) + vectorizeTree(Entry); + } + // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. @@ -5418,7 +5757,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && ProposedToGather.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5643,6 +5984,7 @@ BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; + BundleMember->TE = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; if (BundleMember->UnscheduledDepsInBundle == 0) { ReadyInsts.insert(BundleMember); @@ -5914,6 +6256,85 @@ ReadyInsts.clear(); } +void BoUpSLP::BlockScheduling::reduceSchedulingRegion(Instruction *Start, + Instruction *End) { + if (Start) + ScheduleStart = Start; + if (End) + ScheduleEnd = End; +} + +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + SmallPtrSet Gathers; + SmallPtrSet Reduced; + Instruction *Start = nullptr; + + // We can reduce the number of instructions to be considered for scheduling + // after cutting the tree. Here we shrink the scheduling region from the top, + // consecutively, until we encounter the first required instruction. There might be + // unnecessary NeedToGather nodes related only to other + // NeedToGather nodes and unmapped instructions in chains; we can safely + // delete those.
+ for (std::unique_ptr &TEPtr : reverse(VectorizableTree)) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::NeedToGather || !TE->getOpcode() || + TE->getMainOp()->getParent() != BS->BB) + continue; + for (const EdgeInfo &EI : TE->UserTreeIndices) { + if (EI.UserTE && (EI.UserTE->State != TreeEntry::NeedToGather)) { + auto InstructionsOnly = + make_filter_range(TE->Scalars, Instruction::classof); + for (Value *V : InstructionsOnly) + Gathers.insert(cast(V)); + break; + } + } + } + + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (!getTreeEntry(I) && !Gathers.count(I)) { + Reduced.insert(I); + } else { + Start = I; + break; + } + } + + BS->reduceSchedulingRegion(Start, nullptr); + + for (TreeEntry *Entry : ProposedToGather) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + SD->IsScheduled = false; + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + + if (Reduced.size()) { + for (Instruction *I : Reduced) { + ScheduleData *SD = BS->getScheduleData(I); + if (SD) + SD->SchedulingRegionID = -1; + } + } + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -6444,7 +6865,7 @@ R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(true); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -6649,6 +7070,7 @@ // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. 
InstructionsState S = getSameOpcode(VL); + if (!S.getOpcode()) return false; @@ -6739,7 +7161,7 @@ continue; R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(true); CandidateFound = true; MinCost = std::min(MinCost, Cost); Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -172,11 +172,15 @@ ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 ; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[TMP0]], <2 x i8> undef, <4 x i32> +; MAX-COST-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> poison, <4 x i8> [[TMP1]], <4 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[P4]], i32 2 +; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[P6]], i32 3 +; MAX-COST-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1 +; MAX-COST-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0 ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -188,19 +192,17 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <4 x i32> -; MAX-COST-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> poison, <4 x i1> [[TMP2]], <4 x i32> -; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP4]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 +; MAX-COST-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 +; MAX-COST-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: 
[[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[P27]] +; MAX-COST-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP15]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -183,26 +183,24 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> ; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0 -; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1_1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP3_32:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <4 x 
i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3_32]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -183,26 +183,24 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> ; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 -; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1_1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP3_32:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3_32]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -24,53 +24,53 @@ ; CHECK-NEXT: [[Y_045:%.*]] 
= phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[CONV]], -128 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP5]] to i32 -; CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128 -; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1 -; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 128, [[CONV]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]] -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1 -; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]] -; CHECK-NEXT: [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i32> , [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP8]], <2 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP14]] to i32 ; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] ; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 ; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 ; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP15]], 0 ; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 ; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 ; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128 -; CHECK-NEXT: [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1 -; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]] -; CHECK-NEXT: [[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]] -; CHECK-NEXT: [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1 -; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]] -; CHECK-NEXT: [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX]], 
align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3_1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[CONV_1]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw <2 x i32> [[TMP19]], +; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt <2 x i32> [[TMP20]], +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <2 x i32> , [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <2 x i1> [[TMP21]], <2 x i32> [[TMP20]], <2 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP26]] to i32 ; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] ; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 ; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP27]], 0 ; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 ; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 ; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -104,15 +104,16 @@ ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 ; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T5]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[T111:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x 
float> [[T111]], float [[T13]], i32 2 ; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 ; CHECK-NEXT: ret <4 x float> [[T15]] ; Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -104,15 +104,16 @@ ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 ; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T5]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[T111:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T111]], float [[T13]], i32 2 ; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 ; CHECK-NEXT: ret <4 x float> [[T15]] ; Index: llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -20,26 +20,26 @@ ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[A4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 -; CHECK-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 -; CHECK-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] -; CHECK-NEXT: ret i8 [[TMP_8]] +; CHECK-NEXT: [[A5:%.*]] = 
getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] +; CHECK-NEXT: [[A6:%.*]] = load i8, i8* [[A4]], align 1 +; CHECK-NEXT: [[A7:%.*]] = load i8, i8* [[A5]], align 1 +; CHECK-NEXT: [[A8:%.*]] = add i8 [[A6]], [[A7]] +; CHECK-NEXT: ret i8 [[A8]] ; entry: - %tmp_0 = zext i8 %v0 to i32 - %tmp_1 = zext i8 %v1 to i32 - %tmp_2 = or i32 %tmp_0, 1 - %tmp_3 = or i32 %tmp_1, 1 - %tmp_4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp_2 - %tmp_5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp_3 - %tmp_6 = load i8, i8* %tmp_4 - %tmp_7 = load i8, i8* %tmp_5 - %tmp_8 = add i8 %tmp_6, %tmp_7 - ret i8 %tmp_8 + %a0 = zext i8 %v0 to i32 + %a1 = zext i8 %v1 to i32 + %a2 = or i32 %a0, 1 + %a3 = or i32 %a1, 1 + %a4 = getelementptr inbounds i8, i8* %ptr, i32 %a2 + %a5 = getelementptr inbounds i8, i8* %ptr, i32 %a3 + %a6 = load i8, i8* %a4 + %a7 = load i8, i8* %a5 + %a8 = add i8 %a6, %a7 + ret i8 %a8 } ; When computing minimum sizes, if we cannot prove the sign bit is zero, we @@ -58,16 +58,22 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { ; SSE-LABEL: @PR31243_sext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 -; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 -; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64 -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] -; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 -; SSE-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 -; SSE-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] -; SSE-NEXT: ret i8 [[TMP8]] +; SSE-NEXT: [[B0:%.*]] = sext i8 [[V0:%.*]] to i32 +; SSE-NEXT: [[B1:%.*]] = sext i8 [[V1:%.*]] to i32 +; SSE-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[B0]], i32 0 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[B1]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[TMP1]] to <2 x i16> +; SSE-NEXT: [[TMP3:%.*]] = or <2 x i16> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; SSE-NEXT: [[B4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; SSE-NEXT: [[B5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; SSE-NEXT: [[B6:%.*]] = load i8, i8* [[B4]], align 1 +; SSE-NEXT: [[B7:%.*]] = load i8, i8* [[B5]], align 1 +; SSE-NEXT: [[B8:%.*]] = add i8 [[B6]], [[B7]] +; SSE-NEXT: ret i8 [[B8]] ; ; AVX-LABEL: @PR31243_sext( ; AVX-NEXT: entry: @@ -77,24 +83,41 @@ ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; AVX-NEXT: [[B4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] ; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 ; AVX-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] -; AVX-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 -; AVX-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] -; AVX-NEXT: ret i8 [[TMP8]] +; AVX-NEXT: [[B5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; AVX-NEXT: 
[[B6:%.*]] = load i8, i8* [[B4]], align 1 +; AVX-NEXT: [[B7:%.*]] = load i8, i8* [[B5]], align 1 +; AVX-NEXT: [[B8:%.*]] = add i8 [[B6]], [[B7]] +; AVX-NEXT: ret i8 [[B8]] +; +; AVX2-LABEL: @PR31243_sext( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 +; AVX2-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; AVX2-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; AVX2-NEXT: [[B4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; AVX2-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; AVX2-NEXT: [[B5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; AVX2-NEXT: [[B6:%.*]] = load i8, i8* [[B4]], align 1 +; AVX2-NEXT: [[B7:%.*]] = load i8, i8* [[B5]], align 1 +; AVX2-NEXT: [[B8:%.*]] = add i8 [[B6]], [[B7]] +; AVX2-NEXT: ret i8 [[B8]] ; entry: - %tmp0 = sext i8 %v0 to i32 - %tmp1 = sext i8 %v1 to i32 - %tmp2 = or i32 %tmp0, 1 - %tmp3 = or i32 %tmp1, 1 - %tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2 - %tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3 - %tmp6 = load i8, i8* %tmp4 - %tmp7 = load i8, i8* %tmp5 - %tmp8 = add i8 %tmp6, %tmp7 - ret i8 %tmp8 + %b0 = sext i8 %v0 to i32 + %b1 = sext i8 %v1 to i32 + %b2 = or i32 %b0, 1 + %b3 = or i32 %b1, 1 + %b4 = getelementptr inbounds i8, i8* %ptr, i32 %b2 + %b5 = getelementptr inbounds i8, i8* %ptr, i32 %b3 + %b6 = load i8, i8* %b4 + %b7 = load i8, i8* %b5 + %b8 = add i8 %b6, %b7 + ret i8 %b8 } Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,34 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; 
AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8 +; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16 ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX2 %class.1 = type { %class.2 } %class.2 = type { %"class.3" } @@ -47,6 +47,24 @@ ; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; AVX-NEXT: ret void ; +; AVX2-LABEL: @_ZN1C10SwitchModeEv( +; AVX2-NEXT: for.body.lr.ph.i: +; AVX2-NEXT: [[OR_1:%.*]] = or i64 undef, 1 +; AVX2-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; AVX2-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 +; AVX2-NEXT: [[FOO_2:%.*]] = 
getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 +; AVX2-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 +; AVX2-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 +; AVX2-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; AVX2-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 +; AVX2-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; AVX2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; AVX2-NEXT: ret void +; for.body.lr.ph.i: %or.1 = or i64 undef, 1 store i64 %or.1, i64* undef, align 8 @@ -70,31 +88,28 @@ ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 ; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 -; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 ; SSE-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 -; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 -; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 -; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[AND_1]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[AND]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 -; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 -; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] -; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 -; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 -; SSE-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 -; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 -; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 -; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 -; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] -; SSE-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 -; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 -; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] -; SSE-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], +; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw 
<2 x i64> [[TMP9]], [[TMP11]] +; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( @@ -123,6 +138,32 @@ ; AVX-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; AVX-NEXT: ret void ; +; AVX2-LABEL: @pr35497( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; AVX2-NEXT: [[ADD:%.*]] = add i64 undef, undef +; AVX2-NEXT: store i64 [[ADD]], i64* undef, align 1 +; AVX2-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 +; AVX2-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], +; AVX2-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], +; AVX2-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer +; AVX2-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; AVX2-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], +; AVX2-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; AVX2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; AVX2-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], +; AVX2-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* +; AVX2-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 +; AVX2-NEXT: ret void +; entry: %0 = load i64, i64* undef, align 1 %and = shl i64 %0, 2 Index: llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i32( ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = 
load <4 x i32>, <4 x i32>* [[TMP4]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]] @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %4 = load i32, i32* %0, align 4, !tbaa !2 @@ -52,7 +52,7 @@ define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i8( ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP0:%.*]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, [[TBAA4:!tbaa !.*]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, !tbaa [[TBAA4:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer @@ -62,7 +62,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, [[TBAA4]] +; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, !tbaa [[TBAA4]] ; CHECK-NEXT: ret void ; %4 = load i8, i8* %0, align 1, !tbaa !6 @@ -106,86 +106,111 @@ define void @store_i64(i64* nocapture %0, i32 %1, i32 %2) { ; SSE-LABEL: @store_i64( ; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]] +; SSE-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] ; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] ; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 ; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 ; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 ; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; SSE-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]] +; SSE-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; SSE-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]] +; SSE-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] ; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 ; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 ; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 ; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 ; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; SSE-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]] +; SSE-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: [[TMP20:%.*]] = 
getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]] +; SSE-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] ; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 ; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 ; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 ; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 ; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; SSE-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]] +; SSE-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 -; SSE-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]] +; SSE-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] ; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 ; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 ; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 ; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 ; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; SSE-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]] +; SSE-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: ret void ; +; SSE42-LABEL: @store_i64( +; SSE42-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; SSE42-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE42-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; SSE42-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; SSE42-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; SSE42-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; SSE42-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; SSE42-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; SSE42-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 +; SSE42-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; SSE42-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; SSE42-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; SSE42-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; SSE42-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; SSE42-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; SSE42-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; SSE42-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; SSE42-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; SSE42-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; SSE42-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; SSE42-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; SSE42-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; SSE42-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 +; SSE42-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] +; SSE42-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; SSE42-NEXT: 
[[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; SSE42-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; SSE42-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; SSE42-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; SSE42-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: ret void +; ; AVX1-LABEL: @store_i64( ; AVX1-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX1-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]] -; AVX1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; AVX1-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; AVX1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; AVX1-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; AVX1-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; AVX1-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; AVX1-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]] +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], [[TMP4]] +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], [[TMP4]] +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], [[TMP4]] +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 ; AVX1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; AVX1-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; AVX1-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; AVX1-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; AVX1-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; AVX1-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; AVX1-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; AVX1-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; AVX1-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; AVX1-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; AVX1-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; AVX1-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; AVX1-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; AVX1-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 -; AVX1-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; AVX1-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; AVX1-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; AVX1-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; AVX1-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; AVX1-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; AVX1-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]] +; AVX1-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP10]], i32 1 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP12]], i32 
2 +; AVX1-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3 +; AVX1-NEXT: [[TMP19:%.*]] = lshr <4 x i64> [[TMP18]], +; AVX1-NEXT: [[TMP20:%.*]] = trunc <4 x i64> [[TMP19]] to <4 x i32> +; AVX1-NEXT: [[TMP21:%.*]] = icmp ult <4 x i32> [[TMP20]], +; AVX1-NEXT: [[TMP22:%.*]] = and <4 x i64> [[TMP19]], +; AVX1-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP21]], <4 x i64> [[TMP22]], <4 x i64> +; AVX1-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX1-NEXT: store <4 x i64> [[TMP23]], <4 x i64>* [[TMP24]], align 8, !tbaa [[TBAA5]] ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @store_i64( ; AVX2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, [[TBAA5:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] ; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]] @@ -195,8 +220,24 @@ ; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i64> [[TMP10]], ; AVX2-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* -; AVX2-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, [[TBAA5]] +; AVX2-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]] ; AVX2-NEXT: ret void +; +; AVX512-LABEL: @store_i64( +; AVX512-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX512-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]] +; AVX512-NEXT: [[TMP10:%.*]] = lshr <4 x i64> [[TMP9]], +; AVX512-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = icmp ult <4 x i32> [[TMP11]], +; AVX512-NEXT: [[TMP13:%.*]] = and <4 x i64> [[TMP10]], +; AVX512-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX512-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]] +; AVX512-NEXT: ret void ; %4 = zext i32 %1 to i64 %5 = load i64, i64* %0, align 8, !tbaa !7 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -1,27 +1,95 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S 
-mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { -; CHECK-LABEL: @gather_load( -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: ret void +; SSE-LABEL: @gather_load( +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: ret void +; +; AVX-LABEL: @gather_load( +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds 
i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: ret void +; +; AVX2-LABEL: @gather_load( +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @gather_load( +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX512-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX512-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX512-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds 
i32, i32* [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX512VL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %1, align 4, !tbaa !2 @@ -67,25 +135,15 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1:%.*]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> +; AVX-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP6]], <4 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( @@ -107,6 +165,16 @@ ; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* ; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; 
AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_2( +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], +; AVX512VL-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %3, align 4, !tbaa !2 @@ -134,143 +202,158 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_3( ; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr 
inbounds i32, i32* [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; SSE-NEXT: store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 2 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; SSE-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; SSE-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 3 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; SSE-NEXT: store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 4 +; SSE-NEXT: store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( ; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 ; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 +; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], +; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: 
@gather_load_3( ; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX2-NEXT: [[TMP13:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 0 ; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 1 +; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 2 ; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 3 +; AVX2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP7]], i32 2 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 
[[TMP9]], i32 3 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> undef, i32 [[TMP14]], i32 0 +; AVX2-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP16]], i32 1 +; AVX2-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP18]], i32 2 +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 3 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP28]], <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = add <8 x i32> [[TMP29]], +; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP30]], <8 x i32>* [[TMP31]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_3( ; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX512-NEXT: [[TMP13:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 0 ; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX512-NEXT: [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 1 +; AVX512-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP17:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 2 ; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa 
[[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP19:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 3 +; AVX512-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX512-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP7]], i32 2 +; AVX512-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP9]], i32 3 +; AVX512-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> undef, i32 [[TMP14]], i32 0 +; AVX512-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP16]], i32 1 +; AVX512-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP18]], i32 2 +; AVX512-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 3 +; AVX512-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP28]], <8 x i32> +; AVX512-NEXT: [[TMP30:%.*]] = add <8 x i32> [[TMP29]], +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP30]], <8 x i32>* [[TMP31]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_3( +; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX512VL-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP5]], i32 1 +; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP7]], i32 2 +; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP9]], i32 3 +; AVX512VL-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP18]], <8 x i32> +; AVX512VL-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX512VL-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 %4 = add i32 %3, 1 @@ -315,13 +398,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) { ; 
SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 @@ -337,18 +417,17 @@ ; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] @@ -356,19 +435,12 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, 
i32* [[T1]], i64 21 ; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] @@ -378,81 +450,107 @@ ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 +; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> ; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 0 +; AVX2-NEXT: [[T19:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 1 +; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 2 +; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 3 +; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T7]], i32 1 +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[T11]], i32 2 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[T15]], i32 3 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> undef, i32 [[T19]], i32 0 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T23]], i32 1 +; AVX2-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T27]], i32 2 +; AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[T31]], i32 3 +; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP16]], +; AVX2-NEXT: [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX512-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: 
[[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> ; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 0 +; AVX512-NEXT: [[T19:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 1 +; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 2 +; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 3 +; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX512-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T7]], i32 1 +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[T11]], i32 2 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[T15]], i32 3 +; AVX512-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> undef, i32 [[T19]], i32 0 +; AVX512-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T23]], i32 1 +; AVX512-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T27]], i32 2 +; AVX512-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[T31]], i32 3 +; AVX512-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP15]], <8 x i32> +; AVX512-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP16]], +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; 
+; AVX512VL-LABEL: @gather_load_4( +; AVX512VL-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512VL-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512VL-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T7]], i32 1 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T11]], i32 2 +; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T15]], i32 3 +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX512VL-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -595,6 +693,29 @@ ; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_div( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> +; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> 
[[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> +; AVX512VL-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] +; AVX512VL-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 %4 = getelementptr inbounds float, float* %1, i64 4 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -1,27 +1,95 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { -; CHECK-LABEL: @gather_load( -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], -; 
CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: ret void +; SSE-LABEL: @gather_load( +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: ret void +; +; AVX-LABEL: @gather_load( +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: ret void +; +; AVX2-LABEL: @gather_load( +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; 
AVX2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @gather_load( +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX512-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX512-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX512-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], +; AVX512VL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %1, align 4, !tbaa !2 @@ -67,25 +135,15 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, 
i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1:%.*]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> +; AVX-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP4]], <2 x i64> +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32*> [[TMP5]], <2 x i32*> [[TMP6]], <4 x i32> +; AVX-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( @@ -107,6 +165,16 @@ ; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* ; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_2( +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], +; AVX512VL-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 %4 = load i32, i32* %3, align 4, !tbaa !2 @@ -134,143 +202,158 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_3( ; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* 
[[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; SSE-NEXT: store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 2 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; SSE-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; 
SSE-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 3 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; SSE-NEXT: store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 4 +; SSE-NEXT: store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( ; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 ; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa 
[[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 +; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], +; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( ; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX2-NEXT: [[TMP13:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 0 ; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, 
!tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 1 +; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 2 ; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 3 +; AVX2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP7]], i32 2 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP9]], i32 3 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> undef, i32 [[TMP14]], i32 0 +; AVX2-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP16]], i32 1 +; AVX2-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP18]], i32 2 +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 3 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP28]], <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = add <8 x i32> [[TMP29]], +; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP30]], <8 x i32>* [[TMP31]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_3( ; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa 
[[TBAA0]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX512-NEXT: [[TMP13:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 0 ; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX512-NEXT: [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 1 +; AVX512-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP17:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 2 ; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP19:%.*]] = extractelement <4 x i32*> [[TMP12]], i32 3 +; AVX512-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX512-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP7]], i32 2 +; AVX512-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP9]], i32 3 +; AVX512-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> undef, i32 [[TMP14]], i32 0 +; AVX512-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP16]], i32 1 +; AVX512-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP18]], i32 2 +; AVX512-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 3 +; AVX512-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP28]], <8 x i32> +; AVX512-NEXT: [[TMP30:%.*]] = add <8 x i32> [[TMP29]], +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP30]], <8 x i32>* [[TMP31]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_3( +; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX512VL-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[TMP5]], i32 1 +; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP7]], i32 2 +; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP9]], i32 3 +; AVX512VL-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP18]], <8 x i32> +; AVX512VL-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX512VL-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 %4 = add i32 %3, 1 @@ -315,13 +398,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 @@ -337,18 +417,17 @@ ; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: 
store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] @@ -356,19 +435,12 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] @@ -378,81 +450,107 @@ ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 
[[T31]], i32 7 +; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> ; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 0 +; AVX2-NEXT: [[T19:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 1 +; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 2 +; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 3 +; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> 
[[TMP8]], i32 [[T7]], i32 1 +; AVX2-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[T11]], i32 2 +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[T15]], i32 3 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> undef, i32 [[T19]], i32 0 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T23]], i32 1 +; AVX2-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T27]], i32 2 +; AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[T31]], i32 3 +; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP16]], +; AVX2-NEXT: [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX512-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> ; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP4:%.*]] = extractelement 
<4 x i32*> [[TMP3]], i32 0 +; AVX512-NEXT: [[T19:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 1 +; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 2 +; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32*> [[TMP3]], i32 3 +; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX512-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T7]], i32 1 +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[T11]], i32 2 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[T15]], i32 3 +; AVX512-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> undef, i32 [[T19]], i32 0 +; AVX512-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T23]], i32 1 +; AVX512-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T27]], i32 2 +; AVX512-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[T31]], i32 3 +; AVX512-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP15]], <8 x i32> +; AVX512-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP16]], +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_4( +; AVX512VL-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512VL-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512VL-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T7]], i32 1 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T11]], i32 2 +; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T15]], i32 3 +; AVX512VL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX512VL-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512VL-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -595,6 +693,29 @@ ; AVX512-NEXT: [[TMP21:%.*]] = 
bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_div( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> undef, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> undef, <8 x i32> +; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> +; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> +; AVX512VL-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] +; AVX512VL-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 %4 = getelementptr inbounds float, float* %1, i64 4 Index: llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512-SKX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s 
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=XOP
@a64 = common global [8 x i64] zeroinitializer, align 64
@@ -109,6 +109,24 @@
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @ashr_v8i64(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT: ret void
+;
+; AVX2-SKX-LABEL: @ashr_v8i64(
+; AVX2-SKX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: ret void
+;
; XOP-LABEL: @ashr_v8i64(
; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -260,6 +278,24 @@
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @ashr_v16i32(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT: ret void
+;
+; AVX2-SKX-LABEL: @ashr_v16i32(
+; AVX2-SKX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-SKX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-SKX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-SKX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-SKX-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-SKX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-SKX-NEXT: ret void
+;
; XOP-LABEL: @ashr_v16i32(
; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -470,16 +506,27 @@
; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
; SSE-NEXT: ret void
;
-; AVX-LABEL: @ashr_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
-; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT: ret void
+; AVX1-LABEL: @ashr_v32i16(
+; AVX1-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX1-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX1-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX1-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX1-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX1-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX1-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX1-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ashr_v32i16(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX2-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX2-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-NEXT: ret void
;
; AVX512-LABEL: @ashr_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
@@ -488,6 +535,24 @@
; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @ashr_v32i16(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = ashr <32 x i16> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT: ret void
+;
+; AVX2-SKX-LABEL: @ashr_v32i16(
+; AVX2-SKX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX2-SKX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-SKX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX2-SKX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-SKX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX2-SKX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX2-SKX-NEXT: ret void
+;
; XOP-LABEL: @ashr_v32i16(
; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
@@ -499,6 +564,16 @@
; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; XOP-NEXT: ret void
;
+; AVX-LABEL: @ashr_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
@@ -650,16 +725,27 @@
; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; SSE-NEXT: ret void
;
-; AVX-LABEL: @ashr_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
-; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT: ret void
+; AVX1-LABEL: @ashr_v64i8(
+; AVX1-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX1-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX1-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX1-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX1-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX1-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX1-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX1-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ashr_v64i8(
+; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX2-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX2-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-NEXT: ret void
;
; AVX512-LABEL: @ashr_v64i8(
; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
@@ -668,6 +754,24 @@
; AVX512-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @ashr_v64i8(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = ashr <64 x i8> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT: ret void
+;
+; AVX2-SKX-LABEL: @ashr_v64i8(
+; AVX2-SKX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX2-SKX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-SKX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX2-SKX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-SKX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX2-SKX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX2-SKX-NEXT: ret void
+;
; XOP-LABEL: @ashr_v64i8(
; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
@@ -679,6 +783,16 @@
; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
; XOP-NEXT: ret void
;
+; AVX-LABEL: @ashr_v64i8(
+; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
+; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
+; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT: ret void
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
Index: llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -3,8 +3,8 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512-SKX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2-SKX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=XOP
@a64 = common global [8 x i64] zeroinitializer, align 64
@@ -77,6 +77,24 @@
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @shl_v8i64(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-SKX-NEXT: ret void
+;
+; AVX2-SKX-LABEL: @shl_v8i64(
+; AVX2-SKX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-SKX-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-SKX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-SKX-NEXT: ret void
+;
; XOP-LABEL: @shl_v8i64(
; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@@ -161,6 +179,13 @@
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @shl_v16i32(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-SKX-NEXT: ret void
+;
; XOP-LABEL: @shl_v16i32(
; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@@ -389,6 +414,13 @@
; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @shl_v32i16(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = shl <32 x i16> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
+; AVX512-SKX-NEXT: ret void
+;
; XOP-LABEL: @shl_v32i16(
; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
@@ -569,6 +601,13 @@
; AVX512-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; AVX512-NEXT: ret void
;
+; AVX512-SKX-LABEL: @shl_v64i8(
+; AVX512-SKX-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT: [[TMP3:%.*]] = shl <64 x i8> [[TMP1]], [[TMP2]]
+; AVX512-SKX-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
+; AVX512-SKX-NEXT: ret void
+;
; XOP-LABEL: @shl_v64i8(
; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,20 @@
; CHECK-LABEL: @rftbsub(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef
; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]]
; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]]
; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
; CHECK-NEXT: unreachable
;
entry: