Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -124,6 +124,15 @@ cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt + SLPThrottle("slp-throttle", cl::init(true), cl::Hidden, + cl::desc("Enable tree partial vectorize with throttling")); + +static cl::opt + SLPThrottleBudget("slp-throttling-budget", cl::init(32), cl::Hidden, + cl::desc("Limit the total number of nodes for cost " + "recalculations during throttling")); + static cl::opt MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); @@ -595,11 +604,62 @@ /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - InstructionCost getSpillCost() const; + InstructionCost getSpillCost(); + + /// \returns the cost extracting vectorized elements. + InstructionCost getExtractCost() const; + + /// \returns the cost of gathering canceled elements to be used + /// by vectorized operations during throttling. + InstructionCost getInsertCost(); + + struct TECostComparator { + bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const { + return LHS->Cost > RHS->Cost; + } + }; + using TEVectorizableSet = std::set; + + /// Find a subtree of the whole tree suitable to be vectorized. When + /// vectorizing the whole tree is not profitable, we can consider vectorizing + /// part of that tree. SLP algorithm looks to operations to vectorize starting + /// from seed instructions on the bottom toward the end of chains of + /// dependencies to the top of SLP graph, it groups potentially vectorizable + /// operations in scalar form to bundles. 
+ /// For example: + /// + /// vector form + /// | + /// vector form vector form + /// \ / + /// vector form + /// + /// Total cost is not profitable to vectorize, hence all operations are in + /// scalar form. + /// + /// Here is the same tree after SLP throttling transformation: + /// + /// vector form + /// | + /// vector form gathered nodes + /// \ / + /// vector form + /// + /// So, we can throttle some operations in such a way that it is still + /// profitable to vectorize part of the tree, while vectorizing the whole tree + /// does not make sense. + /// More details: + /// https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf + bool findSubTree(TEVectorizableSet &Vec, InstructionCost TreeCost, + InstructionCost UserCost); + + /// Get the raw sum of the costs of all entries of the tree. + InstructionCost getRawTreeCost(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(); + InstructionCost getTreeCost(bool TreeReduce = false, + InstructionCost UserCost = 0); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -620,6 +680,8 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + InternalTreeUses.clear(); + ProposedToGather.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { @@ -628,6 +690,9 @@ } MinBWs.clear(); InstrElementSize.clear(); + NoCallInst = true; + RawTreeCost = 0; + IsCostSumReady = false; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -790,6 +855,9 @@ /// may not be necessary. bool isLoadCombineCandidate() const; + /// Cut the tree to make it partially vectorizable. 
+ void cutTree(); + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1606,6 +1674,9 @@ /// Does this entry require reordering? SmallVector ReorderIndices; + /// Cost of this tree entry. + InstructionCost Cost = 0; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -1618,6 +1689,9 @@ /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + /// Use of this entry. + TinyPtrVector UseEntries; + /// The index of this treeEntry in VectorizableTree. int Idx = -1; @@ -1850,8 +1924,10 @@ MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) + if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); + VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last); + } return Last; } @@ -1901,6 +1977,9 @@ }; using UserList = SmallVector; + /// \returns the cost of extracting the vectorized elements. + InstructionCost getExtractOperationCost(const ExternalUser &EU) const; + /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it @@ -1951,6 +2030,25 @@ /// after vectorization. UserList ExternalUses; + /// Tree entries that should not be vectorized due to throttling. + SmallPtrSet ProposedToGather; + + /// Raw cost of all elements in the tree. + InstructionCost RawTreeCost = 0; + + /// Indicates that no CallInst was found in the tree and we don't need to + /// calculate spill cost. + bool NoCallInst = true; + + /// True if the raw tree cost has already been calculated for the tree. + bool IsCostSumReady = false; + + /// Current operations width to vectorize. + unsigned BundleWidth = 0; + + /// Uses of tree values by internal tree operations proposed to be vectorized. + SmallDenseMap InternalTreeUses; + /// Values used only by @llvm.assume calls. 
SmallPtrSet EphValues; @@ -2293,6 +2391,9 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// Make the scheduling region smaller. + void reduceSchedulingRegion(Instruction *Start, Instruction *End); + BasicBlock *BB; /// Simple memory allocation for ScheduleData. @@ -2355,6 +2456,9 @@ /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); + /// Remove operations from the list of proposed to schedule. + void removeFromScheduling(BlockScheduling *BS); + /// List of users to ignore during scheduling and that don't need extracting. ArrayRef UserIgnoreList; @@ -2569,7 +2673,7 @@ buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -2602,6 +2706,7 @@ // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. + InternalTreeUses[U].emplace_back(Scalar, U, FoundLane); if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { @@ -3328,6 +3433,50 @@ } } +void BoUpSLP::cutTree() { + SmallVector VecNodes; + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) + continue; + // For all canceled operations we should consider the possibility of + // use by with non-canceled operations and for that, it requires + // to populate ExternalUser list with canceled elements. 
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + TreeEntry *UserTE = getTreeEntry(U); + if (!UserTE || ProposedToGather.count(UserTE) == 0) + continue; + // Ignore users in the user ignore list. + auto *UserInst = dyn_cast(U); + if (!UserInst) + continue; + + if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U + << " from lane " << Lane << " from " << *Scalar + << ".\n"); + ExternalUses.emplace_back(Scalar, U, Lane); + } + } + } + // Canceling unprofitable elements. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + ScalarToTreeEntry.erase(V); +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); +#endif + } + } +} + unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; @@ -4122,12 +4271,11 @@ return true; } -InstructionCost BoUpSLP::getSpillCost() const { +InstructionCost BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). 
- unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); InstructionCost Cost = 0; SmallPtrSet LiveValues; @@ -4192,6 +4340,7 @@ } if (NumCalls) { + NoCallInst = false; SmallVector V; for (auto *II : LiveValues) V.push_back(FixedVectorType::get(II->getType(), BundleWidth)); @@ -4204,15 +4353,109 @@ return Cost; } -InstructionCost BoUpSLP::getTreeCost() { - InstructionCost Cost = 0; +InstructionCost BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); + Value *ScalarRoot = VectorizableTree.front()->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt; + VecTy = FixedVectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +InstructionCost BoUpSLP::getExtractCost() const { + InstructionCost ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + // Consider the possibility of extracting vectorized + // values for canceled elements use. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. 
+ auto It = InternalTreeUses.find(V); + if (It != InternalTreeUses.end()) { + const UserList &UL = It->second; + for (const ExternalUser &IU : UL) + ExtractCost += getExtractOperationCost(IU); + } + } + } + for (const ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + ExtractCost += getExtractOperationCost(EU); + } + return ExtractCost; +} + +InstructionCost BoUpSLP::getInsertCost() { + InstructionCost InsertCost = 0; + for (TreeEntry *Entry : ProposedToGather) { + // Skip already vectorized TreeEntries; they are already in vector form and + // we don't need to gather those operations. + if (ProposedToGather.count(Entry) == 0) + continue; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + if (llvm::any_of(Inst->users(), [this](User *Op) { + return ScalarToTreeEntry.count(Op) > 0; + })) { + InsertCost += getEntryCost(Entry); + break; + } + } + } + return InsertCost; +} + +bool BoUpSLP::findSubTree(TEVectorizableSet &Vec, InstructionCost TreeCost, + InstructionCost UserCost) { + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + // Ignore any non-vectorizable entries, entries with low cost, + // or the root entry. + if ((Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) || + Entry->Cost <= 0 || !Entry->Idx) + continue; + Vec.insert(Entry); + } + InstructionCost Sum = 0; + for (TreeEntry *Entry : Vec) + Sum += Entry->Cost; + // Avoid reducing the tree if there is no potential room to reduce. 
+ if ((TreeCost - UserCost - Sum) >= -SLPCostThreshold) + return false; + + return (Vec.size() > 0); +} + +InstructionCost BoUpSLP::getRawTreeCost() { + InstructionCost CostSum = 0; + BundleWidth = VectorizableTree.front()->Scalars.size(); LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); - - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -4227,69 +4470,103 @@ // existing heuristics based on tree size may yield different results. // if (TE.State == TreeEntry::NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - EntryPtr->isSame(TE.Scalars); - })) + llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; - InstructionCost C = getEntryCost(&TE); - Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + << ".\n"); + CostSum += TE.Cost; + LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n"); } - SmallPtrSet ExtractCostCalculated; - InstructionCost ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. 
- if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; - - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::ScatterVectorize) continue; + InstructionCost GatherCost = 0; + for (TreeEntry *Gather : TE->UseEntries) + if (Gather->State != TreeEntry::Vectorize && + Gather->State != TreeEntry::ScatterVectorize) + GatherCost += Gather->Cost; + TE->Cost += GatherCost; + } + return CostSum; +} - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = FixedVectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } +InstructionCost BoUpSLP::getTreeCost(bool TreeReduce, + InstructionCost UserCost) { + InstructionCost CostSum; + if (!IsCostSumReady) { + CostSum = getRawTreeCost(); + RawTreeCost = CostSum; + } else { + CostSum = RawTreeCost; } - InstructionCost SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + InstructionCost ExtractCost = getExtractCost(); + InstructionCost SpillCost = 0; + if (!NoCallInst || !IsCostSumReady) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + if (!IsCostSumReady) + IsCostSumReady = true; + InstructionCost InsertCost = getInsertCost(); + InstructionCost Cost = + CostSum + ExtractCost + SpillCost + InsertCost - UserCost; #ifndef NDEBUG SmallString<256> Str; - { - raw_svector_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; - } + raw_svector_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Insert Cost = " << InsertCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); #endif - + if (SLPThrottle && TreeReduce && (Cost - UserCost) >= -SLPCostThreshold) { + TEVectorizableSet Vec; + if (!findSubTree(Vec, Cost, UserCost)) + return Cost; + if (!NoCallInst && Vec.size() > SLPThrottleBudget) { + std::set::iterator It = + Vec.begin(); + std::advance(It, (unsigned)SLPThrottleBudget); + Vec.erase(It, Vec.end()); + } + + for (TreeEntry *T : Vec) { + ProposedToGather.insert(T); + T->State = TreeEntry::NeedToGather; + 
for (Value *V : T->Scalars) { + MustGather.insert(V); + ExternalUses.erase( + llvm::remove_if(ExternalUses, + [V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + CostSum -= T->Cost; + ExtractCost = getExtractCost(); + if (!NoCallInst) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + InsertCost = getInsertCost(); + Cost = CostSum + ExtractCost + SpillCost + InsertCost - UserCost; + if (Cost < -SLPCostThreshold) { + cutTree(); + return Cost; + } + } + ProposedToGather.clear(); + } return Cost; } @@ -5136,12 +5413,25 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (!ProposedToGather.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if ((Entry->State == TreeEntry::Vectorize || + Entry->State == TreeEntry::ScatterVectorize) && + !Entry->VectorizedValue) + vectorizeTree(Entry); + } + // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. @@ -5271,7 +5561,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. 
+ if (!Ty->isVoidTy() && ProposedToGather.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5496,6 +5788,7 @@ BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; + BundleMember->TE = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; if (BundleMember->UnscheduledDepsInBundle == 0) { ReadyInsts.insert(BundleMember); @@ -5764,6 +6057,85 @@ ReadyInsts.clear(); } +void BoUpSLP::BlockScheduling::reduceSchedulingRegion(Instruction *Start, + Instruction *End) { + if (Start) + ScheduleStart = Start; + if (End) + ScheduleEnd = End; +} + +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + SmallPtrSet Gathers; + SmallPtrSet Reduced; + Instruction *Start = nullptr; + + // We can reduce the number of instructions to be considered for scheduling, + // after cutting the tree. Here we shrink the scheduling area from the top, + // consecutively, until we encounter the required instruction. There might be + // unnecessary NeedToGather nodes with the relationship only to other + // NeedToGather nodes and unmapped instructions in chains; we can safely + // delete those. 
+ for (std::unique_ptr &TEPtr : reverse(VectorizableTree)) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::NeedToGather || !TE->getOpcode() || + TE->getMainOp()->getParent() != BS->BB) + continue; + for (const EdgeInfo &EI : TE->UserTreeIndices) { + if (EI.UserTE && (EI.UserTE->State != TreeEntry::NeedToGather)) { + auto InstructionsOnly = + make_filter_range(TE->Scalars, Instruction::classof); + for (Value *V : InstructionsOnly) + Gathers.insert(cast(V)); + break; + } + } + } + + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (!getTreeEntry(I) && !Gathers.count(I)) { + Reduced.insert(I); + } else { + Start = I; + break; + } + } + + BS->reduceSchedulingRegion(Start, nullptr); + + for (TreeEntry *Entry : ProposedToGather) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + SD->IsScheduled = false; + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + + if (Reduced.size()) { + for (Instruction *I : Reduced) { + ScheduleData *SD = BS->getScheduleData(I); + if (SD) + SD->SchedulingRegionID = -1; + } + } + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -6293,7 +6665,7 @@ R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(true); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -6499,6 +6871,7 @@ // Check that all of the parts are instructions of the same type, // we permit an alternate 
opcode via InstructionsState. InstructionsState S = getSameOpcode(VL); + if (!S.getOpcode()) return false; @@ -6593,7 +6966,7 @@ continue; R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost UserCost = 0; CandidateFound = true; if (CompensateUseCost) { // TODO: Use TTI's getScalarizationOverhead for sequence of inserts @@ -6623,7 +6996,6 @@ // Switching to the TTI interface might help a bit. // Alternative solution could be pattern-match to detect a no-op or // shuffle. - InstructionCost UserCost = 0; for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { auto *IE = cast(InsertUses[I + Lane]); if (auto *CI = dyn_cast(IE->getOperand(2))) @@ -6632,8 +7004,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost << ".\n"); - Cost -= UserCost; } + InstructionCost Cost = R.getTreeCost(true, UserCost); MinCost = std::min(MinCost, Cost); Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s +; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t -slp-throttle=false | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s -; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s +; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t -slp-throttle=false | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll 
=================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -204,11 +204,15 @@ ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 ; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1 +; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP3]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[P4]], i32 2 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[P6]], i32 3 +; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -220,19 +224,21 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> poison, i1 
[[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; MAX-COST-NEXT: [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i32 0 +; MAX-COST-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; MAX-COST-NEXT: [[TMP12:%.*]] = insertelement <4 x i1> [[TMP10]], i1 [[TMP11]], i32 1 +; MAX-COST-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; MAX-COST-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP13]], i32 2 +; MAX-COST-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP8]], i32 3 +; MAX-COST-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) +; MAX-COST-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], [[P27]] +; MAX-COST-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP19]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 
[[P15]], i32 -720, i32 -80 Index: llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -9,6 +9,16 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux" +; YAML: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: test_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '3' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '5' + ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedHorizontalReduction @@ -19,6 +29,17 @@ ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '8' +; YAML: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: test_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-1' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + + define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { ; CHECK-LABEL: @test_select( ; CHECK-NEXT: entry: @@ -28,35 +49,40 @@ ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, 
[[FOR_BODY_LR_PH]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: 
[[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_025]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 1, i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[TMP12]], [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP14]], [[H]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP15]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[S_0_LCSSA]] ; entry: Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -233,14 +233,16 @@ ; GFX9-NEXT: ret void ; ; VI-LABEL: @canonicalize_v2f16( -; VI-NEXT: [[I0:%.*]] = load half, half addrspace(3)* [[A:%.*]], align 2 -; VI-NEXT: [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]]) -; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, half addrspace(3)* [[A]], i64 1 -; VI-NEXT: [[I3:%.*]] = 
load half, half addrspace(3)* [[ARRAYIDX3]], align 2 -; VI-NEXT: [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]]) -; VI-NEXT: store half [[CANONICALIZE0]], half addrspace(3)* [[C:%.*]], align 2 -; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half addrspace(3)* [[C]], i64 1 -; VI-NEXT: store half [[CANONICALIZE1]], half addrspace(3)* [[ARRAYIDX5]], align 2 +; VI-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* +; VI-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 +; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x half> [[TMP2]], i32 0 +; VI-NEXT: [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[TMP3]]) +; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP2]], i32 1 +; VI-NEXT: [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[TMP4]]) +; VI-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[CANONICALIZE0]], i32 0 +; VI-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[CANONICALIZE1]], i32 1 +; VI-NEXT: [[TMP7:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* +; VI-NEXT: store <2 x half> [[TMP6]], <2 x half> addrspace(3)* [[TMP7]], align 2 ; VI-NEXT: ret void ; %i0 = load half, half addrspace(3)* %a, align 2 Index: llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -24,53 +24,53 @@ ; CHECK-NEXT: [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[CONV]], -128 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP5]] to i32 -; CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128 -; 
CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1 -; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 128, [[CONV]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]] -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1 -; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]] -; CHECK-NEXT: [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i32> , [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP8]], <2 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP14]] to i32 ; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] ; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 ; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 ; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp 
eq i8 [[TMP15]], 0 ; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 ; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 ; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128 -; CHECK-NEXT: [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1 -; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]] -; CHECK-NEXT: [[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]] -; CHECK-NEXT: [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1 -; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]] -; CHECK-NEXT: [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[CONV3_1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[CONV_1]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw <2 x i32> [[TMP19]], +; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt <2 x i32> [[TMP20]], +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <2 x i32> , [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <2 x i1> [[TMP21]], <2 x i32> [[TMP20]], <2 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] ; 
CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP26]] to i32 ; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] ; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 ; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP27]], 0 ; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 ; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 ; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] Index: llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,49 +7,65 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]] -; 
CHECK-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_EXTRA26]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> 
poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP19:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[VAL_0:%.*]] = add i32 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP3]], [[VAL_0]] +; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] +; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] +; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] +; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] +; CHECK-NEXT: [[VAL_6:%.*]] = add i32 [[TMP3]], 55 +; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], [[VAL_6]] +; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] +; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] +; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] +; CHECK-NEXT: [[VAL_11:%.*]] = add i32 [[TMP3]], 285 +; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], [[VAL_11]] +; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] +; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] +; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] +; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] +; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] +; CHECK-NEXT: [[VAL_18:%.*]] = add i32 
[[TMP3]], 1240 +; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], [[VAL_18]] +; CHECK-NEXT: [[VAL_20:%.*]] = add i32 [[TMP3]], 1496 +; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]] +; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] +; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] +; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] +; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] +; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] +; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] +; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] +; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] +; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] +; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] +; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] +; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] +; CHECK-NEXT: [[VAL_34:%.*]] = add i32 [[TMP3]], 8555 +; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]] +; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] +; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] +; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1 +; 
CHECK-NEXT: [[TMP13:%.*]] = and <2 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP19]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP18]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( Index: llvm/test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -348,22 +348,24 @@ define void @no_vec_shuff_reorder() #0 { ; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr 
inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fsub float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> 
[[TMP5]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP11]], i32 3 +; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 Index: llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll +++ llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll @@ -283,70 +283,82 @@ ; SLM-NEXT: ret void ; ; AVX1-LABEL: @smul_v16i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* 
@a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[B10:%.*]] = load i32, i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3) -; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3) -; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3) -; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3) -; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3) -; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3) -; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3) -; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3) -; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3) -; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3) -; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3) -; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3) -; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3) -; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3) -; 
AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3) -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* 
@a32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP9]], i32 [[TMP10]], i32 3) +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP11]], i32 [[TMP12]], i32 3) +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP13]], i32 [[TMP14]], i32 3) +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; AVX1-NEXT: [[R3:%.*]] = call i32 
@llvm.smul.fix.i32(i32 [[TMP15]], i32 [[TMP16]], i32 3) +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP17]], i32 [[TMP18]], i32 3) +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP19]], i32 [[TMP20]], i32 3) +; AVX1-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP21]], i32 [[TMP22]], i32 3) +; AVX1-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP23]], i32 [[TMP24]], i32 3) +; AVX1-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP25]], i32 [[TMP26]], i32 3) +; AVX1-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP27]], i32 [[TMP28]], i32 3) +; AVX1-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP29]], i32 [[TMP30]], i32 3) +; AVX1-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP31]], i32 [[TMP32]], i32 3) +; AVX1-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP4]], i32 
0 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP33]], i32 [[TMP34]], i32 3) +; AVX1-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP35]], i32 [[TMP36]], i32 3) +; AVX1-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP37]], i32 [[TMP38]], i32 3) +; AVX1-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[TMP39]], i32 [[TMP40]], i32 3) +; AVX1-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[R0]], i32 0 +; AVX1-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[R1]], i32 1 +; AVX1-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[R2]], i32 2 +; AVX1-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[R3]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> poison, i32 [[R4]], i32 0 +; AVX1-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[R5]], i32 1 +; AVX1-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[R6]], i32 2 +; AVX1-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[R7]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> poison, i32 [[R8]], i32 0 +; AVX1-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[R9]], i32 1 +; AVX1-NEXT: [[TMP51:%.*]] = insertelement <4 
x i32> [[TMP50]], i32 [[R10]], i32 2 +; AVX1-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[R11]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP52]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP53:%.*]] = insertelement <4 x i32> poison, i32 [[R12]], i32 0 +; AVX1-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[R13]], i32 1 +; AVX1-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[R14]], i32 2 +; AVX1-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP55]], i32 [[R15]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP56]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @smul_v16i32( @@ -1212,70 +1224,82 @@ ; SLM-NEXT: ret void ; ; AVX1-LABEL: @umul_v16i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = 
load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), 
align 4 -; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3) -; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3) -; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3) -; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3) -; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3) -; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3) -; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3) -; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3) -; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3) -; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3) -; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3) -; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3) -; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3) -; AVX1-NEXT: [[R14:%.*]] = call i32 
@llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3) -; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3) -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; AVX1-NEXT: 
[[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP9]], i32 [[TMP10]], i32 3) +; AVX1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP11]], i32 [[TMP12]], i32 3) +; AVX1-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP13]], i32 [[TMP14]], i32 3) +; AVX1-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> 
[[TMP5]], i32 3 +; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP15]], i32 [[TMP16]], i32 3) +; AVX1-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP17]], i32 [[TMP18]], i32 3) +; AVX1-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP19]], i32 [[TMP20]], i32 3) +; AVX1-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP21]], i32 [[TMP22]], i32 3) +; AVX1-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP23]], i32 [[TMP24]], i32 3) +; AVX1-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 +; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP25]], i32 [[TMP26]], i32 3) +; AVX1-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 +; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP27]], i32 [[TMP28]], i32 3) +; AVX1-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; AVX1-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 +; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP29]], i32 [[TMP30]], i32 3) +; AVX1-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 +; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP31]], i32 [[TMP32]], i32 3) +; AVX1-NEXT: 
[[TMP33:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP33]], i32 [[TMP34]], i32 3) +; AVX1-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP35]], i32 [[TMP36]], i32 3) +; AVX1-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP37]], i32 [[TMP38]], i32 3) +; AVX1-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[TMP39]], i32 [[TMP40]], i32 3) +; AVX1-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[R0]], i32 0 +; AVX1-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[R1]], i32 1 +; AVX1-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[R2]], i32 2 +; AVX1-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[R3]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> poison, i32 [[R4]], i32 0 +; AVX1-NEXT: [[TMP46:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[R5]], i32 1 +; AVX1-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[R6]], i32 2 +; AVX1-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[R7]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> poison, i32 [[R8]], i32 0 +; AVX1-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP49]], i32 
[[R9]], i32 1 +; AVX1-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[R10]], i32 2 +; AVX1-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[R11]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP52]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP53:%.*]] = insertelement <4 x i32> poison, i32 [[R12]], i32 0 +; AVX1-NEXT: [[TMP54:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[R13]], i32 1 +; AVX1-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[R14]], i32 2 +; AVX1-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> [[TMP55]], i32 [[R15]], i32 3 +; AVX1-NEXT: store <4 x i32> [[TMP56]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @umul_v16i32( Index: llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll +++ llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -15,8 +15,10 @@ ; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 ; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1 -; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]], align 8 -; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[A_AND]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B_AND]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ 
llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,38 +16,59 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]] -; SSE-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16 -; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1 -; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1 -; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1 -; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]] -; SSE-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1 -; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]] -; SSE-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1 -; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]] -; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1 -; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] -; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1 -; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1 -; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1 -; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1 -; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP12]], i8* 
getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1 -; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1 -; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1 -; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1 -; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15 +; 
SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1 +; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> poison, <16 x i32> +; SSE-NEXT: [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]] +; SSE-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[TMP19]], i32 15 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <16 x i8> [[TMP19]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> poison, i8 [[TMP21]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP19]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP23]], i32 1 +; SSE-NEXT: [[TMP25:%.*]] = extractelement <16 x i8> [[TMP19]], i32 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP25]], i32 2 +; SSE-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP19]], i32 3 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP27]], i32 3 +; SSE-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP19]], i32 4 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP29]], i32 4 +; SSE-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP19]], i32 5 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP31]], i32 5 +; SSE-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP19]], i32 6 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP33]], i32 6 +; SSE-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP19]], i32 7 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP35]], i32 7 +; SSE-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP19]], i32 8 +; SSE-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP37]], i32 8 +; SSE-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP19]], i32 9 +; SSE-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP39]], i32 9 +; SSE-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> 
[[TMP19]], i32 10 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <16 x i8> [[TMP40]], i8 [[TMP41]], i32 10 +; SSE-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP19]], i32 11 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP43]], i32 11 +; SSE-NEXT: [[TMP45:%.*]] = extractelement <16 x i8> [[TMP19]], i32 12 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP45]], i32 12 +; SSE-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP19]], i32 13 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP47]], i32 13 +; SSE-NEXT: [[TMP49:%.*]] = extractelement <16 x i8> [[TMP19]], i32 14 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP49]], i32 14 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP20]], i32 15 +; SSE-NEXT: store <16 x i8> [[TMP51]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( Index: llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -11,7 +11,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 ; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = 
load double*, double** [[_M_CUR2_I_I81]], align 8 ; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 @@ -26,8 +25,10 @@ ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] ; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 -; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double*> poison, double* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double*> [[TMP5]], double* [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double** [[_M_CUR2_I_I]] to <2 x double*>* +; CHECK-NEXT: store <2 x double*> [[TMP6]], <2 x double*>* [[TMP7]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] Index: llvm/test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -21,18 +21,24 @@ ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: 
[[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP5]], 4.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL11]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP7]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP16]], <4 x double>* [[TMP17]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll 
=================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -103,18 +103,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; AVX2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 false) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP3]], i1 false) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTLZ2:%.*]] 
= call i32 @llvm.ctlz.i32(i32 [[TMP4]], i1 false) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP5]], i1 false) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTLZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTLZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -559,18 +561,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ctlz_undef_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true) -; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true) -; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true) -; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true) -; AVX2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 
[[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 true) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP3]], i1 true) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP4]], i1 true) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP5]], i1 true) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTLZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTLZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTLZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTLZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 Index: llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll +++ llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -123,33 +123,37 @@ ; SSE2-NEXT: ret void ; ; SSE42-LABEL: @ctpop_4i32( -; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 
2), align 4 -; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; SSE42-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]]) +; SSE42-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; SSE42-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; SSE42-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; SSE42-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x 
i32>*), align 4 ; SSE42-NEXT: ret void ; ; AVX-LABEL: @ctpop_4i32( -; AVX-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]]) +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> 
poison, i32 [[CTPOP0]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTPOP1]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTPOP2]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTPOP3]], i32 3 +; AVX-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -178,57 +182,63 @@ ; SSE2-NEXT: ret void ; ; SSE42-LABEL: @ctpop_8i32( -; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; SSE42-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; SSE42-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; SSE42-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; SSE42-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]]) -; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) -; SSE42-NEXT: [[CTPOP6:%.*]] = 
call i32 @llvm.ctpop.i32(i32 [[LD6]]) -; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; SSE42-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; SSE42-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; SSE42-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP6]]) +; SSE42-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SSE42-NEXT: [[CTPOP4:%.*]] 
= call i32 @llvm.ctpop.i32(i32 [[TMP7]]) +; SSE42-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP8]]) +; SSE42-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP9]]) +; SSE42-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP10]]) +; SSE42-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP0]], i32 0 +; SSE42-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CTPOP1]], i32 1 +; SSE42-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CTPOP2]], i32 2 +; SSE42-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CTPOP3]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CTPOP4]], i32 0 +; SSE42-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CTPOP5]], i32 1 +; SSE42-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CTPOP6]], i32 2 +; SSE42-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CTPOP7]], i32 3 +; SSE42-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; ; AVX1-LABEL: @ctpop_8i32( -; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2 -; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2 -; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2 -; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2 -; AVX1-NEXT: [[LD4:%.*]] = load i32, 
i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2 -; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2 -; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2 -; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2 -; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]]) -; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]]) -; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]]) -; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]]) -; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]]) -; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]]) -; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]]) -; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]]) -; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2 -; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2 -; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2 -; AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2 -; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2 -; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2 -; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2 -; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* 
bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2 +; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP2]]) +; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP3]]) +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP4]]) +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP5]]) +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP6]]) +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP7]]) +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP8]]) +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP9]]) +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CTPOP0]], i32 0 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CTPOP1]], i32 1 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CTPOP2]], i32 2 +; AVX1-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CTPOP3]], i32 3 +; AVX1-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CTPOP4]], i32 4 +; AVX1-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CTPOP5]], i32 5 +; AVX1-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CTPOP6]], i32 6 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CTPOP7]], i32 7 +; AVX1-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2 ; AVX1-NEXT: ret void ; ; 
AVX2-LABEL: @ctpop_8i32( Index: llvm/test/Transforms/SLPVectorizer/X86/cttz.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -103,18 +103,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) -; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) -; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) -; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) -; AVX2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP2]], i1 false) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP3]], i1 false) +; AVX2-NEXT: 
[[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP4]], i1 false) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP5]], i1 false) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTTZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTTZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 @@ -559,18 +561,20 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @cttz_undef_4i32( -; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 -; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4 -; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4 -; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true) -; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true) -; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true) -; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true) -; AVX2-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX2-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX2-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds 
([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX2-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4 +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP2]], i1 true) +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP3]], i1 true) +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX2-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP4]], i1 true) +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX2-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP5]], i1 true) +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CTTZ0]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CTTZ1]], i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[CTTZ2]], i32 2 +; AVX2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[CTTZ3]], i32 3 +; AVX2-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4 ; AVX2-NEXT: ret void ; %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4 Index: llvm/test/Transforms/SLPVectorizer/X86/fma.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fma.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fma.ll @@ -26,16 +26,20 @@ define void @fma_2f64() #0 { ; NO-FMA-LABEL: @fma_2f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; 
NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[TMP5]], double [[TMP6]]) +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP7]], 
double [[TMP8]], double [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[FMA1]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_2f64( @@ -61,26 +65,34 @@ define void @fma_4f64() #0 { ; NO-FMA-LABEL: @fma_4f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr 
inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; 
NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP10]], double [[TMP11]], double [[TMP12]]) +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FMA1]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; NO-FMA-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0 +; NO-FMA-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[FMA3]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP22]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f64( @@ -116,46 
+128,62 @@ define void @fma_8f64() #0 { ; NO-FMA-LABEL: @fma_8f64( -; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load double, double* 
getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double 
[[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]]) -; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x 
double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[TMP13]], double [[TMP14]], double [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[TMP16]], double [[TMP17]], double [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 
+; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 +; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]]) +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 +; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[TMP22]], double [[TMP23]], double [[TMP24]]) +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]]) +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[TMP28]], double [[TMP29]], double [[TMP30]]) +; NO-FMA-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 +; NO-FMA-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP12]], i32 0 +; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[TMP31]], double [[TMP32]], double [[TMP33]]) +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 +; NO-FMA-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[TMP12]], i32 1 +; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[TMP34]], double [[TMP35]], double [[TMP36]]) +; NO-FMA-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[FMA0]], i32 0 +; NO-FMA-NEXT: 
[[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[FMA1]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP38]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP39:%.*]] = insertelement <2 x double> poison, double [[FMA2]], i32 0 +; NO-FMA-NEXT: [[TMP40:%.*]] = insertelement <2 x double> [[TMP39]], double [[FMA3]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP40]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP41:%.*]] = insertelement <2 x double> poison, double [[FMA4]], i32 0 +; NO-FMA-NEXT: [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[FMA5]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP42]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; NO-FMA-NEXT: [[TMP43:%.*]] = insertelement <2 x double> poison, double [[FMA6]], i32 0 +; NO-FMA-NEXT: [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[FMA7]], i32 1 +; NO-FMA-NEXT: store <2 x double> [[TMP44]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_8f64( @@ -224,26 +252,30 @@ define void @fma_4f32() #0 { ; NO-FMA-LABEL: @fma_4f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr 
inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x 
float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float [[TMP5]], float [[TMP6]]) +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FMA1]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FMA2]], i32 2 +; NO-FMA-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[FMA3]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP19]], <4 x float>* bitcast 
([16 x float]* @dst32 to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_4f32( @@ -279,46 +311,54 @@ define void @fma_8f32() #0 { ; NO-FMA-LABEL: @fma_8f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* 
@srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float 
@llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 
x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; NO-FMA-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; NO-FMA-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; NO-FMA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]]) +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[FMA5:%.*]] = call float 
@llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]]) +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]]) +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]]) +; NO-FMA-NEXT: [[TMP31:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FMA1]], i32 1 +; NO-FMA-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[FMA2]], i32 2 +; NO-FMA-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FMA3]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP34]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP35:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0 +; NO-FMA-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FMA5]], i32 1 +; NO-FMA-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[FMA6]], i32 2 +; NO-FMA-NEXT: [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[FMA7]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP38]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA-LABEL: @fma_8f32( @@ -374,86 +414,102 @@ define void @fma_16f32() #0 { ; NO-FMA-LABEL: @fma_16f32( -; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 
4 -; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4 -; 
NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4 -; 
NO-FMA-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4 -; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4 -; 
NO-FMA-NEXT: [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4 -; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]]) -; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]]) -; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]]) -; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]]) -; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]]) -; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]]) -; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]]) -; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]]) -; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]]) -; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]]) -; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]]) -; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]]) -; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]]) -; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]]) -; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]]) -; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]]) -; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; 
NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; NO-FMA-NEXT: store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; NO-FMA-NEXT: store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; NO-FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast 
([16 x float]* @srcA32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; NO-FMA-NEXT: 
[[TMP14:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 +; NO-FMA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 +; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP14]], float [[TMP15]]) +; NO-FMA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; NO-FMA-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; NO-FMA-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP9]], i32 1 +; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]]) +; NO-FMA-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; NO-FMA-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 +; NO-FMA-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP9]], i32 2 +; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]]) +; NO-FMA-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; NO-FMA-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 +; NO-FMA-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP9]], i32 3 +; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP23]], float [[TMP24]]) +; NO-FMA-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; NO-FMA-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; NO-FMA-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]]) +; NO-FMA-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; NO-FMA-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[TMP6]], i32 1 +; NO-FMA-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP29]], float [[TMP30]]) +; NO-FMA-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; NO-FMA-NEXT: 
[[TMP32:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; NO-FMA-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[TMP31]], float [[TMP32]], float [[TMP33]]) +; NO-FMA-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; NO-FMA-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; NO-FMA-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[TMP34]], float [[TMP35]], float [[TMP36]]) +; NO-FMA-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; NO-FMA-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[TMP7]], i32 0 +; NO-FMA-NEXT: [[TMP39:%.*]] = extractelement <4 x float> [[TMP11]], i32 0 +; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[TMP37]], float [[TMP38]], float [[TMP39]]) +; NO-FMA-NEXT: [[TMP40:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; NO-FMA-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP7]], i32 1 +; NO-FMA-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 +; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[TMP40]], float [[TMP41]], float [[TMP42]]) +; NO-FMA-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; NO-FMA-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[TMP7]], i32 2 +; NO-FMA-NEXT: [[TMP45:%.*]] = extractelement <4 x float> [[TMP11]], i32 2 +; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[TMP43]], float [[TMP44]], float [[TMP45]]) +; NO-FMA-NEXT: [[TMP46:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; NO-FMA-NEXT: [[TMP47:%.*]] = extractelement <4 x float> [[TMP7]], i32 3 +; NO-FMA-NEXT: [[TMP48:%.*]] = extractelement <4 x float> [[TMP11]], i32 3 +; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[TMP46]], float [[TMP47]], float [[TMP48]]) +; NO-FMA-NEXT: [[TMP49:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; NO-FMA-NEXT: 
[[TMP50:%.*]] = extractelement <4 x float> [[TMP8]], i32 0 +; NO-FMA-NEXT: [[TMP51:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 +; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[TMP49]], float [[TMP50]], float [[TMP51]]) +; NO-FMA-NEXT: [[TMP52:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; NO-FMA-NEXT: [[TMP53:%.*]] = extractelement <4 x float> [[TMP8]], i32 1 +; NO-FMA-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 +; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[TMP52]], float [[TMP53]], float [[TMP54]]) +; NO-FMA-NEXT: [[TMP55:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; NO-FMA-NEXT: [[TMP56:%.*]] = extractelement <4 x float> [[TMP8]], i32 2 +; NO-FMA-NEXT: [[TMP57:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 +; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[TMP55]], float [[TMP56]], float [[TMP57]]) +; NO-FMA-NEXT: [[TMP58:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; NO-FMA-NEXT: [[TMP59:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 +; NO-FMA-NEXT: [[TMP60:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 +; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[TMP58]], float [[TMP59]], float [[TMP60]]) +; NO-FMA-NEXT: [[TMP61:%.*]] = insertelement <4 x float> poison, float [[FMA0]], i32 0 +; NO-FMA-NEXT: [[TMP62:%.*]] = insertelement <4 x float> [[TMP61]], float [[FMA1]], i32 1 +; NO-FMA-NEXT: [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[FMA2]], i32 2 +; NO-FMA-NEXT: [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[FMA3]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP64]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP65:%.*]] = insertelement <4 x float> poison, float [[FMA4]], i32 0 +; NO-FMA-NEXT: [[TMP66:%.*]] = insertelement <4 x float> [[TMP65]], float [[FMA5]], i32 1 +; NO-FMA-NEXT: [[TMP67:%.*]] = insertelement <4 x float> [[TMP66]], float [[FMA6]], i32 2 +; 
NO-FMA-NEXT: [[TMP68:%.*]] = insertelement <4 x float> [[TMP67]], float [[FMA7]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP68]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP69:%.*]] = insertelement <4 x float> poison, float [[FMA8]], i32 0 +; NO-FMA-NEXT: [[TMP70:%.*]] = insertelement <4 x float> [[TMP69]], float [[FMA9]], i32 1 +; NO-FMA-NEXT: [[TMP71:%.*]] = insertelement <4 x float> [[TMP70]], float [[FMA10]], i32 2 +; NO-FMA-NEXT: [[TMP72:%.*]] = insertelement <4 x float> [[TMP71]], float [[FMA11]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP72]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; NO-FMA-NEXT: [[TMP73:%.*]] = insertelement <4 x float> poison, float [[FMA12]], i32 0 +; NO-FMA-NEXT: [[TMP74:%.*]] = insertelement <4 x float> [[TMP73]], float [[FMA13]], i32 1 +; NO-FMA-NEXT: [[TMP75:%.*]] = insertelement <4 x float> [[TMP74]], float [[FMA14]], i32 2 +; NO-FMA-NEXT: [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[FMA15]], i32 3 +; NO-FMA-NEXT: store <4 x float> [[TMP76]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; NO-FMA-NEXT: ret void ; ; FMA256-LABEL: @fma_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll @@ -21,59 +21,40 @@ define void @fptosi_8f64_8i64() #0 { ; SSE-LABEL: @fptosi_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 
1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x 
i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> 
[[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptosi_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; 
AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @fptosi_8f64_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to 
<8 x i64> @@ -254,57 +235,69 @@ define void @fptosi_8f32_8i64() #0 { ; SSE-LABEL: @fptosi_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] 
= fptosi float [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f32_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; 
AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 
7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[TMP3]] to i64 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[TMP4]] to i64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[TMP5]] to i64 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[TMP6]] to i64 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[TMP7]] to i64 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[TMP8]] to i64 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[TMP9]] to i64 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[TMP10]] to i64 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 
[[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f32_8i64( Index: llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -21,59 +21,40 @@ define void @fptosi_8f64_8i64() #0 { ; SSE-LABEL: @fptosi_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; 
SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE-NEXT: 
[[CVT0:%.*]] = fptosi double [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 
x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptosi_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: 
store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @fptosi_8f64_8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64> @@ -254,57 +235,69 @@ define void @fptosi_8f32_8i64() #0 { ; SSE-LABEL: @fptosi_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 
0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; 
SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptosi_8f32_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64 -; 
AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[TMP3]] to i64 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[TMP4]] to i64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[TMP5]] to i64 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[TMP6]] to i64 +; AVX256NODQ-NEXT: 
[[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[TMP7]] to i64 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[TMP8]] to i64 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[TMP9]] to i64 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[TMP10]] to i64 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptosi_8f32_8i64( Index: llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -21,59 +21,40 @@ define void @fptoui_8f64_8i64() #0 { ; SSE-LABEL: @fptoui_8f64_8i64( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds 
([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; 
SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE-NEXT: 
[[CVT7:%.*]] = fptoui double [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f64_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* 
@src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @fptoui_8f64_8i64( ; 
AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 ; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64> @@ -118,57 +99,63 @@ define void @fptoui_8f64_8i32() #0 { ; SSE-LABEL: @fptoui_8f64_8i32( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 
4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[TMP3]] to i32 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[TMP4]] to i32 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[TMP5]] to i32 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[TMP6]] to i32 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[TMP7]] to i32 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[TMP8]] to i32 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[TMP9]] to i32 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 +; SSE-NEXT: 
[[CVT7:%.*]] = fptoui double [[TMP10]] to i32 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f64_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; 
AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[TMP2]] to i32 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: 
[[CVT1:%.*]] = fptoui double [[TMP3]] to i32 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[TMP4]] to i32 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[TMP5]] to i32 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[TMP6]] to i32 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[TMP7]] to i32 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[TMP8]] to i32 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[TMP9]] to i32 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX-LABEL: @fptoui_8f64_8i32( @@ -299,57 +286,69 @@ define void @fptoui_8f32_8i64() #0 { ; SSE-LABEL: @fptoui_8f32_8i64( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* 
@src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; SSE-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([16 x float]* @src32 to <2 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6) to <2 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[TMP5]] to i64 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[TMP6]] to i64 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[TMP7]] to i64 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[TMP8]] to i64 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[TMP9]] to i64 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[TMP10]] to i64 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[TMP11]] to i64 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[TMP12]] to i64 +; SSE-NEXT: [[TMP13:%.*]] = 
insertelement <2 x i64> poison, i64 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[CVT1]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* bitcast ([8 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[CVT2]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP15]], i64 [[CVT3]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> [[TMP17]], i64 [[CVT5]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP18]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[CVT6]], i32 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP19]], i64 [[CVT7]], i32 1 +; SSE-NEXT: store <2 x i64> [[TMP20]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i64( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64 -; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = load <4 x float>, 
<4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[TMP3]] to i64 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[TMP4]] to i64 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[TMP5]] to i64 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[TMP6]] to i64 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[TMP7]] to i64 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[TMP8]] to i64 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[TMP9]] to i64 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[TMP10]] to i64 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[CVT4]], i32 0 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[CVT5]], i32 1 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 
[[CVT6]], i32 2 +; AVX256NODQ-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[CVT7]], i32 3 +; AVX256NODQ-NEXT: store <4 x i64> [[TMP18]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256NODQ-NEXT: ret void ; ; AVX512-LABEL: @fptoui_8f32_8i64( @@ -396,57 +395,63 @@ define void @fptoui_8f32_8i32() #0 { ; SSE-LABEL: @fptoui_8f32_8i32( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), 
align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[TMP3]] to i32 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[TMP4]] to i32 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[TMP5]] to i32 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[TMP6]] to i32 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[TMP7]] to i32 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[TMP8]] to i32 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = fptoui float 
[[TMP9]] to i32 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[TMP10]] to i32 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[CVT3]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[CVT7]], i32 3 +; SSE-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX256NODQ-LABEL: @fptoui_8f32_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* 
getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; AVX256NODQ-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[TMP2]] to i32 +; AVX256NODQ-NEXT: 
[[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 +; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[TMP3]] to i32 +; AVX256NODQ-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 +; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[TMP4]] to i32 +; AVX256NODQ-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 +; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[TMP5]] to i32 +; AVX256NODQ-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 +; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[TMP6]] to i32 +; AVX256NODQ-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 +; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[TMP7]] to i32 +; AVX256NODQ-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 +; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[TMP8]] to i32 +; AVX256NODQ-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 +; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[TMP9]] to i32 +; AVX256NODQ-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> poison, i32 [[CVT0]], i32 0 +; AVX256NODQ-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[CVT1]], i32 1 +; AVX256NODQ-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[CVT2]], i32 2 +; AVX256NODQ-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[CVT3]], i32 3 +; AVX256NODQ-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[CVT4]], i32 4 +; AVX256NODQ-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[CVT5]], i32 5 +; AVX256NODQ-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[CVT6]], i32 6 +; AVX256NODQ-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[CVT7]], i32 7 +; AVX256NODQ-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 ; AVX256NODQ-NEXT: ret void ; ; AVX-LABEL: @fptoui_8f32_8i32( Index: llvm/test/Transforms/SLPVectorizer/X86/fround.ll =================================================================== --- 
llvm/test/Transforms/SLPVectorizer/X86/fround.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fround.ll @@ -27,12 +27,14 @@ define void @ceil_2f64() #0 { ; SSE2-LABEL: @ceil_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_2f64( @@ -58,18 +60,22 @@ define void @ceil_4f64() #0 { ; SSE2-LABEL: @ceil_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* 
getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0 
+; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f64( @@ -104,30 +110,38 @@ define void @ceil_8f64() #0 { ; SSE2-LABEL: @ceil_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call 
double @llvm.ceil.f64(double [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]]) -; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 
x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[CEIL1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[CEIL2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[CEIL3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x 
double> [[TMP17]], double [[CEIL5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[CEIL6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[CEIL7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f64( @@ -198,12 +212,14 @@ define void @floor_2f64() #0 { ; SSE2-LABEL: @floor_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store 
<2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_2f64( @@ -229,18 +245,22 @@ define void @floor_4f64() #0 { ; SSE2-LABEL: @floor_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call double 
@llvm.floor.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[FLOOR1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f64( @@ -275,30 +295,38 @@ define void @floor_8f64() #0 { ; SSE2-LABEL: @floor_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 
x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]]) -; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x 
double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[FLOOR1]], i32 
1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[FLOOR2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[FLOOR3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[FLOOR5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[FLOOR6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[FLOOR7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f64( @@ -369,12 +397,14 @@ define void @nearbyint_2f64() #0 { ; SSE2-LABEL: @nearbyint_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x 
double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_2f64( @@ -400,18 +430,22 @@ define void @nearbyint_4f64() #0 { ; SSE2-LABEL: @nearbyint_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x 
double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f64( @@ -446,30 +480,38 @@ 
define void @nearbyint_8f64() #0 { ; SSE2-LABEL: @nearbyint_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x 
double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[TMP7]]) +; 
SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[NEARBYINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[NEARBYINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[NEARBYINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[NEARBYINT6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double 
[[NEARBYINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f64( @@ -540,12 +582,14 @@ define void @rint_2f64() #0 { ; SSE2-LABEL: @rint_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_2f64( @@ -571,18 +615,22 @@ define void @rint_4f64() #0 { ; SSE2-LABEL: @rint_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load 
double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: 
[[RINT3:%.*]] = call double @llvm.rint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f64( @@ -617,30 +665,38 @@ define void @rint_8f64() #0 { ; SSE2-LABEL: @rint_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call 
double @llvm.rint.f64(double [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]]) -; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; 
SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[RINT1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[RINT2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[RINT3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x 
double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[RINT5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[RINT6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[RINT7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f64( @@ -711,12 +767,14 @@ define void @trunc_2f64() #0 { ; SSE2-LABEL: @trunc_2f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = insertelement 
<2 x double> poison, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_2f64( @@ -742,18 +800,22 @@ define void @trunc_4f64() #0 { ; SSE2-LABEL: @trunc_4f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 
2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f64( @@ -788,30 +850,38 @@ define void @trunc_8f64() #0 { ; SSE2-LABEL: @trunc_8f64( -; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr 
inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) -; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 -; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 -; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 -; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 -; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 -; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 -; SSE2-NEXT: store 
double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = 
insertelement <2 x double> poison, double [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TRUNC1]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TRUNC2]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP15]], double [[TRUNC3]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[TRUNC5]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP18]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TRUNC6]], i32 0 +; SSE2-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TRUNC7]], i32 1 +; SSE2-NEXT: store <2 x double> [[TMP20]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f64( @@ -882,18 +952,20 @@ define void @ceil_4f32() #0 { ; SSE2-LABEL: @ceil_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; 
SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_4f32( @@ 
-925,30 +997,34 @@ define void @ceil_8f32() #0 { ; SSE2-LABEL: @ceil_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr 
inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float 
[[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_8f32( @@ -995,54 +1071,62 @@ define void @ceil_16f32() #0 { ; SSE2-LABEL: @ceil_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr 
inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) -; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) -; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) -; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) -; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) -; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) -; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) -; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) -; SSE2-NEXT: [[CEIL8:%.*]] = call float 
@llvm.ceil.f32(float [[LD8]]) -; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]]) -; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]]) -; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]]) -; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]]) -; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]]) -; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]]) -; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]]) -; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[CEIL11]], float* getelementptr 
inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 
x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CEIL0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CEIL1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CEIL2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CEIL3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* 
@dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CEIL4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CEIL5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CEIL6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CEIL7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CEIL8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CEIL9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CEIL10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CEIL11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CEIL12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CEIL13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CEIL14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CEIL15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @ceil_16f32( @@ -1137,18 +1221,20 @@ define void @floor_4f32() #0 { ; SSE2-LABEL: @floor_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, 
i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = 
insertelement <4 x float> [[TMP7]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_4f32( @@ -1180,30 +1266,34 @@ define void @floor_8f32() #0 { ; SSE2-LABEL: @floor_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call float 
@llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], 
i32 0 +; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_8f32( @@ -1250,54 +1340,62 @@ define void @floor_16f32() #0 { ; SSE2-LABEL: @floor_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x 
float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) -; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) -; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) -; SSE2-NEXT: 
[[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) -; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) -; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) -; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) -; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) -; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]]) -; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]]) -; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]]) -; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]]) -; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]]) -; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]]) -; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]]) -; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]]) -; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float 
[[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x 
float> [[TMP1]], i32 2 +; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[FLOOR15:%.*]] = call float 
@llvm.floor.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[FLOOR0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[FLOOR1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[FLOOR2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[FLOOR3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[FLOOR4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[FLOOR5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[FLOOR6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[FLOOR7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[FLOOR8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[FLOOR9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[FLOOR10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[FLOOR11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[FLOOR12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[FLOOR13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[FLOOR14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[FLOOR15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* 
bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @floor_16f32( @@ -1392,18 +1490,20 @@ define void @nearbyint_4f32() #0 { ; SSE2-LABEL: @nearbyint_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: 
[[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_4f32( @@ -1435,30 +1535,34 @@ define void @nearbyint_8f32() #0 { ; SSE2-LABEL: @nearbyint_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], 
[16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* 
@src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = 
insertelement <4 x float> [[TMP16]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_8f32( @@ -1505,54 +1609,62 @@ define void @nearbyint_16f32() #0 { ; SSE2-LABEL: @nearbyint_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], 
[16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) -; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) -; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) -; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) -; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) -; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) -; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) -; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) -; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]]) -; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]]) -; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]]) -; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]]) -; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]]) -; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) -; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) -; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, 
i64 0), align 4 -; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), 
align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[NEARBYINT8:%.*]] = 
call float @llvm.nearbyint.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[NEARBYINT1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[NEARBYINT2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[NEARBYINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[NEARBYINT5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[NEARBYINT6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float 
[[NEARBYINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[NEARBYINT9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[NEARBYINT10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[NEARBYINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[NEARBYINT12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[NEARBYINT13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[NEARBYINT14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[NEARBYINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @nearbyint_16f32( @@ -1647,18 +1759,20 @@ define void @rint_4f32() #0 { ; SSE2-LABEL: @rint_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float 
@llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_4f32( @@ -1690,30 +1804,34 @@ define void 
@rint_8f32() #0 { ; SSE2-LABEL: @rint_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x 
float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[TMP9]]) +; SSE2-NEXT: 
[[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_8f32( @@ -1760,54 +1878,62 @@ define void @rint_16f32() #0 { ; SSE2-LABEL: @rint_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], 
[16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) -; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) -; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) -; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) -; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) -; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) -; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) -; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) -; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]]) -; 
SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]]) -; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]]) -; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]]) -; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]]) -; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) -; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) -; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) -; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x 
float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; 
SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[TMP20]]) +; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[RINT0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[RINT1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[RINT2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[RINT3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), 
align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[RINT4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[RINT5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[RINT6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[RINT7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[RINT8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[RINT9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[RINT10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[RINT11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[RINT12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[RINT13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[RINT14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[RINT15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @rint_16f32( @@ -1902,18 +2028,20 @@ define void @trunc_4f32() #0 { ; SSE2-LABEL: @trunc_4f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; 
SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP2]]) +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = insertelement <4 x float> 
[[TMP7]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_4f32( @@ -1945,30 +2073,34 @@ define void @trunc_8f32() #0 { ; SSE2-LABEL: @trunc_8f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; 
SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP3]]) +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP4]]) +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC4:%.*]] = 
call float @llvm.trunc.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_8f32( @@ -2015,54 +2147,62 @@ define void @trunc_16f32() #0 { ; SSE2-LABEL: @trunc_16f32( -; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, 
i64 2), align 4 -; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 -; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 -; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 -; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 -; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 -; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 -; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 -; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 -; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) -; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) -; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) -; SSE2-NEXT: [[TRUNC3:%.*]] = call float 
@llvm.trunc.f32(float [[LD3]]) -; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) -; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) -; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) -; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) -; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]]) -; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]]) -; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[LD10]]) -; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]]) -; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]]) -; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]]) -; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) -; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) -; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 -; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 -; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 -; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 -; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE2-NEXT: store float [[TRUNC8]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 -; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 -; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 -; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 -; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[TMP5]]) +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[TMP6]]) +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 
+; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[TMP7]]) +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[TMP8]]) +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[TMP9]]) +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[TMP10]]) +; SSE2-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[TMP11]]) +; SSE2-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[TMP12]]) +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[TMP13]]) +; SSE2-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[TMP14]]) +; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 +; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[TMP15]]) +; SSE2-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 +; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[TMP16]]) +; SSE2-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[TMP17]]) +; SSE2-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 +; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[TMP18]]) +; SSE2-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[TMP19]]) +; SSE2-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 +; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[TMP20]]) 
+; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TRUNC0]], i32 0 +; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TRUNC1]], i32 1 +; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TRUNC2]], i32 2 +; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TRUNC3]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[TRUNC4]], i32 0 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TRUNC5]], i32 1 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TRUNC6]], i32 2 +; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[TRUNC7]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[TRUNC8]], i32 0 +; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TRUNC9]], i32 1 +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TRUNC10]], i32 2 +; SSE2-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[TRUNC11]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE2-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[TRUNC12]], i32 0 +; SSE2-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TRUNC13]], i32 1 +; SSE2-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TRUNC14]], i32 2 +; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TRUNC15]], i32 3 +; SSE2-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr 
inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE2-NEXT: ret void ; ; SSE41-LABEL: @trunc_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -112,12 +112,15 @@ ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 ; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T5]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP5]], i32 1 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float ; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -112,12 +112,15 @@ ; 
CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 ; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T5]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP5]], i32 1 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float ; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 Index: llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < 
%s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -58,12 +58,18 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { ; SSE-LABEL: @PR31243_sext( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 -; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 -; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64 -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] -; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP0:%.*]] = sext i8 [[V0:%.*]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = sext i8 [[V1:%.*]] to i32 +; SSE-NEXT: [[TMP_0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; SSE-NEXT: [[TMP_1:%.*]] = insertelement <2 x i32> [[TMP_0]], i32 [[TMP1]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[TMP_1]] to <2 x i16> +; SSE-NEXT: [[TMP3:%.*]] = or <2 x i16> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] ; SSE-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 ; SSE-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 ; SSE-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] @@ -86,6 +92,23 @@ ; AVX-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] ; AVX-NEXT: ret i8 [[TMP8]] ; +; AVX2-LABEL: @PR31243_sext( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 
+; AVX2-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; AVX2-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; AVX2-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; AVX2-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 +; AVX2-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 +; AVX2-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] +; AVX2-NEXT: ret i8 [[TMP8]] +; entry: %tmp0 = sext i8 %v0 to i32 %tmp1 = sext i8 %v1 to i32 Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,34 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr 
inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 
[[TMP7]], 8 +; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16 ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -70,31 +70,28 @@ ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 ; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 -; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 ; SSE-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 -; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 -; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 -; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[AND_1]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[AND]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], 
[0 x i64]* undef, i64 0, i64 4 -; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 -; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] -; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 -; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 -; SSE-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 -; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 -; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 -; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 -; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] -; SSE-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 -; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 -; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] -; SSE-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], +; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( Index: llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll 
=================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i32( ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> 
[[TMP5]], [[TMP7]] @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %4 = load i32, i32* %0, align 4, !tbaa !2 @@ -52,7 +52,7 @@ define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i8( ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP0:%.*]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, [[TBAA4:!tbaa !.*]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, !tbaa [[TBAA4:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer @@ -62,7 +62,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, [[TBAA4]] +; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, !tbaa [[TBAA4]] ; CHECK-NEXT: ret void ; %4 = load i8, i8* %0, align 1, !tbaa !6 @@ -104,88 +104,109 @@ } define void @store_i64(i64* nocapture %0, i32 %1, i32 %2) { -; SSE-LABEL: @store_i64( -; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]] -; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; 
SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; SSE-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; SSE-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; SSE-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; SSE-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 -; SSE-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]] -; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; SSE-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]] -; SSE-NEXT: ret void +; 
SSE2-LABEL: @store_i64( +; SSE2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; SSE2-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; SSE2-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; SSE2-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; SSE2-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; SSE2-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; SSE2-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; SSE2-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 +; SSE2-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; SSE2-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; SSE2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; SSE2-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; SSE2-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; SSE2-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; SSE2-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; SSE2-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; SSE2-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; SSE2-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; SSE2-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; SSE2-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; SSE2-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; SSE2-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 +; SSE2-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 
[[TMP4]] +; SSE2-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; SSE2-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; SSE2-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; SSE2-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; SSE2-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; SSE2-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE2-NEXT: ret void +; +; SSE42-LABEL: @store_i64( +; SSE42-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; SSE42-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <2 x i64>* +; SSE42-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE42-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE42-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], [[TMP4]] +; SSE42-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE42-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], [[TMP4]] +; SSE42-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 +; SSE42-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 +; SSE42-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP12]], +; SSE42-NEXT: [[TMP14:%.*]] = trunc <2 x i64> [[TMP13]] to <2 x i32> +; SSE42-NEXT: [[TMP15:%.*]] = icmp ult <2 x i32> [[TMP14]], +; SSE42-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP13]], +; SSE42-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP15]], <2 x i64> [[TMP16]], <2 x i64> +; SSE42-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* +; SSE42-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* [[TMP18]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; SSE42-NEXT: [[TMP20:%.*]] = bitcast i64* [[TMP19]] to <2 x i64>* +; SSE42-NEXT: [[TMP21:%.*]] = load <2 x i64>, <2 x i64>* [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP21]], i32 0 +; SSE42-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], [[TMP4]] +; SSE42-NEXT: [[TMP24:%.*]] = 
extractelement <2 x i64> [[TMP21]], i32 1 +; SSE42-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], [[TMP4]] +; SSE42-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> poison, i64 [[TMP23]], i32 0 +; SSE42-NEXT: [[TMP27:%.*]] = insertelement <2 x i64> [[TMP26]], i64 [[TMP25]], i32 1 +; SSE42-NEXT: [[TMP28:%.*]] = lshr <2 x i64> [[TMP27]], +; SSE42-NEXT: [[TMP29:%.*]] = trunc <2 x i64> [[TMP28]] to <2 x i32> +; SSE42-NEXT: [[TMP30:%.*]] = icmp ult <2 x i32> [[TMP29]], +; SSE42-NEXT: [[TMP31:%.*]] = and <2 x i64> [[TMP28]], +; SSE42-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP30]], <2 x i64> [[TMP31]], <2 x i64> +; SSE42-NEXT: [[TMP33:%.*]] = bitcast i64* [[TMP19]] to <2 x i64>* +; SSE42-NEXT: store <2 x i64> [[TMP32]], <2 x i64>* [[TMP33]], align 8, !tbaa [[TBAA5]] +; SSE42-NEXT: ret void ; ; AVX1-LABEL: @store_i64( ; AVX1-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX1-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, [[TBAA5:!tbaa !.*]] -; AVX1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; AVX1-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; AVX1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; AVX1-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; AVX1-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; AVX1-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 -; AVX1-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; AVX1-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; AVX1-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; AVX1-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; AVX1-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; AVX1-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; AVX1-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP20:%.*]] = 
getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; AVX1-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; AVX1-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; AVX1-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; AVX1-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; AVX1-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; AVX1-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; AVX1-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 -; AVX1-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, [[TBAA5]] -; AVX1-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; AVX1-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; AVX1-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; AVX1-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; AVX1-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; AVX1-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; AVX1-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, [[TBAA5]] +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], [[TMP4]] +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], [[TMP4]] +; AVX1-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> undef, <2 x i32> +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> undef, <2 x i32> zeroinitializer +; AVX1-NEXT: [[TMP14:%.*]] = mul <2 x i64> [[TMP11]], [[TMP13]] +; AVX1-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP14]], <2 x i64> undef, <4 x i32> +; 
AVX1-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0 +; AVX1-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP10]], i32 1 +; AVX1-NEXT: [[TMP18:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> [[TMP15]], <4 x i32> +; AVX1-NEXT: [[TMP19:%.*]] = lshr <4 x i64> [[TMP18]], +; AVX1-NEXT: [[TMP20:%.*]] = trunc <4 x i64> [[TMP19]] to <4 x i32> +; AVX1-NEXT: [[TMP21:%.*]] = icmp ult <4 x i32> [[TMP20]], +; AVX1-NEXT: [[TMP22:%.*]] = and <4 x i64> [[TMP19]], +; AVX1-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP21]], <4 x i64> [[TMP22]], <4 x i64> +; AVX1-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX1-NEXT: store <4 x i64> [[TMP23]], <4 x i64>* [[TMP24]], align 8, !tbaa [[TBAA5]] ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @store_i64( ; AVX2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, [[TBAA5:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] ; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]] @@ -195,8 +216,24 @@ ; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i64> [[TMP10]], ; AVX2-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* -; AVX2-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, [[TBAA5]] +; AVX2-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]] ; AVX2-NEXT: ret void +; +; AVX512-LABEL: @store_i64( +; AVX512-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX512-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* 
[[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP6]], [[TMP8]] +; AVX512-NEXT: [[TMP10:%.*]] = lshr <4 x i64> [[TMP9]], +; AVX512-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = icmp ult <4 x i32> [[TMP11]], +; AVX512-NEXT: [[TMP13:%.*]] = and <4 x i64> [[TMP10]], +; AVX512-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP13]], <4 x i64> +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX512-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[TMP15]], align 8, !tbaa [[TBAA5]] +; AVX512-NEXT: ret void ; %4 = zext i32 %1 to i64 %5 = load i64, i64* %0, align 8, !tbaa !7 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -8,19 +8,19 @@ define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, 
[[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -46,66 +46,66 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP9]], i32* 
[[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP9]], i32* 
[[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_2( ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> 
poison, i32* [[TMP1:%.*]], i32 0 ; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -133,144 +133,87 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_3( -; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, 
[[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; 
SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; SSE-NEXT: store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 2 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; SSE-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; SSE-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 3 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; SSE-NEXT: store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* 
[[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 4 +; SSE-NEXT: store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* 
[[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP15:%.*]] 
= insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP14]], <8 x i32> +; AVX-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* 
[[TMP13]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> +; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> +; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = 
shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; 
AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 %4 = add i32 %3, 1 @@ -315,13 +258,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 @@ -329,130 +269,76 @@ ; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; 
SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; 
SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, 
[[TBAA0]] -; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T7]], i32 1 +; 
AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP5]], <8 x i32> +; AVX-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], 
-; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1 +; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], +; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x 
i32>* +; AVX2-NEXT: store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, 
[[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -509,21 +395,21 @@ ; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1 ; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2 ; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer ; SSE-NEXT: [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 ; SSE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> 
@llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]] ; SSE-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( @@ -542,13 +428,13 @@ ; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP22:%.*]] 
= fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( @@ -567,13 +453,13 @@ ; AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX2-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_div( @@ -592,13 +478,13 @@ ; AVX512-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX512-NEXT: [[TMP16:%.*]] = insertelement <8 x 
float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX512-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX512-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -8,19 +8,19 @@ define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] ; 
CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]] +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -46,66 +46,66 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]] +; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]] +; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 
4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_2( ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0 ; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> -; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0:!tbaa !.*]] +; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], ; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]] +; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = getelementptr inbounds i32, i32* %1, i64 1 @@ -133,144 +133,87 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_3( -; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], 
align 4, [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; SSE-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; SSE-NEXT: store i32 [[TMP19]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 2 +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; SSE-NEXT: store i32 [[TMP23]], i32* 
[[TMP20]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; SSE-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 3 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; SSE-NEXT: store i32 [[TMP27]], i32* [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 4 +; SSE-NEXT: store i32 [[TMP31]], i32* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 
[[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 
+; AVX-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP10]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP12:%.*]] = getelementptr i32, <4 x i32*> [[TMP11]], <4 x i64> +; AVX-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP12]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP14]], <8 x i32> +; AVX-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> 
[[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> +; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP11]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP12]], <2 x i64> +; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 
x i32*> [[TMP13]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP17]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP15]], <8 x i32> +; AVX2-NEXT: [[TMP20:%.*]] = add <8 x i32> [[TMP19]], +; AVX2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP20]], <8 x i32>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 
-; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 %4 = add i32 %3, 1 @@ -315,13 +258,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 @@ -329,130 +269,76 @@ ; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds 
i32, i32* [[T0]], i64 7 ; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 ; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; 
SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX-NEXT: [[T30:%.*]] = 
getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] -; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] -; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] -; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]] -; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[T15:%.*]] = 
load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <8 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP5]], <8 x i32> +; AVX-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; 
AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> undef, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> , <2 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: 
[[TMP11:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T7]], i32 1 +; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP10]], <8 x i32> +; AVX2-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], +; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP15]], <8 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = 
add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] -; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -509,21 +395,21 @@ ; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1 ; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2 ; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer ; SSE-NEXT: [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]] ; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 ; SSE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x 
float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), [[TBAA0]] +; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]] ; SSE-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( @@ -542,13 +428,13 @@ ; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> 
zeroinitializer ; AVX-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( @@ -567,13 +453,13 @@ ; AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX2-NEXT: [[TMP23:%.*]] = bitcast 
float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX2-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_div( @@ -592,13 +478,13 @@ ; AVX512-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5 ; AVX512-NEXT: [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6 ; AVX512-NEXT: [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7 -; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> -; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] ; AVX512-NEXT: [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]] ; AVX512-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]] +; AVX512-NEXT: store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 Index: llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ 
llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -340,146 +340,169 @@ define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: 
[[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]] -; SSE-NEXT: 
[[R4:%.*]] = ashr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 
0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], 
i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] 
= load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[R0:%.*]] = ashr i16 [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[R1:%.*]] = ashr i16 [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[R2:%.*]] = ashr i16 [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[R3:%.*]] = ashr i16 [[TMP15]], [[TMP16]] +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[R4:%.*]] = ashr i16 [[TMP17]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[R5:%.*]] = ashr i16 [[TMP19]], [[TMP20]] +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[R6:%.*]] = ashr i16 [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[R7:%.*]] = ashr i16 [[TMP23]], [[TMP24]] +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], 
i32 0 +; SSE-NEXT: [[R8:%.*]] = ashr i16 [[TMP25]], [[TMP26]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[R9:%.*]] = ashr i16 [[TMP27]], [[TMP28]] +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[R10:%.*]] = ashr i16 [[TMP29]], [[TMP30]] +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[R11:%.*]] = ashr i16 [[TMP31]], [[TMP32]] +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[R12:%.*]] = ashr i16 [[TMP33]], [[TMP34]] +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[R13:%.*]] = ashr i16 [[TMP35]], [[TMP36]] +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[R14:%.*]] = ashr i16 [[TMP37]], [[TMP38]] +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[R15:%.*]] = ashr i16 [[TMP39]], [[TMP40]] +; SSE-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[R16:%.*]] = ashr i16 [[TMP41]], [[TMP42]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[R17:%.*]] = ashr i16 [[TMP43]], [[TMP44]] +; SSE-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[R18:%.*]] 
= ashr i16 [[TMP45]], [[TMP46]] +; SSE-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[R19:%.*]] = ashr i16 [[TMP47]], [[TMP48]] +; SSE-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[R20:%.*]] = ashr i16 [[TMP49]], [[TMP50]] +; SSE-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[R21:%.*]] = ashr i16 [[TMP51]], [[TMP52]] +; SSE-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[R22:%.*]] = ashr i16 [[TMP53]], [[TMP54]] +; SSE-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[R23:%.*]] = ashr i16 [[TMP55]], [[TMP56]] +; SSE-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[R24:%.*]] = ashr i16 [[TMP57]], [[TMP58]] +; SSE-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[R25:%.*]] = ashr i16 [[TMP59]], [[TMP60]] +; SSE-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[R26:%.*]] = ashr i16 [[TMP61]], [[TMP62]] +; SSE-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[R27:%.*]] = ashr i16 [[TMP63]], [[TMP64]] +; SSE-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[R28:%.*]] = ashr i16 [[TMP65]], 
[[TMP66]] +; SSE-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[R29:%.*]] = ashr i16 [[TMP67]], [[TMP68]] +; SSE-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[R30:%.*]] = ashr i16 [[TMP69]], [[TMP70]] +; SSE-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[R31:%.*]] = ashr i16 [[TMP71]], [[TMP72]] +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0 +; SSE-NEXT: [[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3 +; SSE-NEXT: [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5 +; SSE-NEXT: [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1 +; SSE-NEXT: [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3 +; SSE-NEXT: [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 
[[R15]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1 +; SSE-NEXT: [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3 +; SSE-NEXT: [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5 +; SSE-NEXT: [[TMP95:%.*]] = insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1 +; SSE-NEXT: [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3 +; SSE-NEXT: [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @ashr_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: 
[[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 -; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: ret void +; AVX1-LABEL: @ashr_v32i16( +; AVX1-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX1-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX1-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX1-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX1-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @ashr_v32i16( +; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP2:%.*]] 
= load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX2-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX2-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @ashr_v32i16( ; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2 @@ -499,6 +522,16 @@ ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; +; AVX-LABEL: @ashr_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store 
<16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 @@ -650,16 +683,27 @@ ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; -; AVX-LABEL: @ashr_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 -; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: ret void +; AVX1-LABEL: @ashr_v64i8( +; AVX1-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 
+; AVX1-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX1-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] +; AVX1-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] +; AVX1-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX1-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @ashr_v64i8( +; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX2-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX2-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @ashr_v64i8( ; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1 @@ -679,6 +723,16 @@ ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; +; AVX-LABEL: @ashr_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX-NEXT: 
[[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: ret void %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 Index: llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -289,134 +289,146 @@ define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; 
SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: 
[[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], [[B17]] -; 
SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: 
store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[R0:%.*]] = lshr i16 [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[R1:%.*]] = lshr i16 [[TMP11]], [[TMP12]] +; SSE-NEXT: 
[[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[R2:%.*]] = lshr i16 [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[R3:%.*]] = lshr i16 [[TMP15]], [[TMP16]] +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[R4:%.*]] = lshr i16 [[TMP17]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[R5:%.*]] = lshr i16 [[TMP19]], [[TMP20]] +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[R6:%.*]] = lshr i16 [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[R7:%.*]] = lshr i16 [[TMP23]], [[TMP24]] +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[R8:%.*]] = lshr i16 [[TMP25]], [[TMP26]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[R9:%.*]] = lshr i16 [[TMP27]], [[TMP28]] +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[R10:%.*]] = lshr i16 [[TMP29]], [[TMP30]] +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[R11:%.*]] = lshr i16 [[TMP31]], [[TMP32]] +; SSE-NEXT: [[TMP33:%.*]] = extractelement <8 x 
i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[R12:%.*]] = lshr i16 [[TMP33]], [[TMP34]] +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[R13:%.*]] = lshr i16 [[TMP35]], [[TMP36]] +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[R14:%.*]] = lshr i16 [[TMP37]], [[TMP38]] +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[R15:%.*]] = lshr i16 [[TMP39]], [[TMP40]] +; SSE-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[R16:%.*]] = lshr i16 [[TMP41]], [[TMP42]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[R17:%.*]] = lshr i16 [[TMP43]], [[TMP44]] +; SSE-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[R18:%.*]] = lshr i16 [[TMP45]], [[TMP46]] +; SSE-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[R19:%.*]] = lshr i16 [[TMP47]], [[TMP48]] +; SSE-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[R20:%.*]] = lshr i16 [[TMP49]], [[TMP50]] +; SSE-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[R21:%.*]] = lshr i16 [[TMP51]], [[TMP52]] +; SSE-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; 
SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[R22:%.*]] = lshr i16 [[TMP53]], [[TMP54]] +; SSE-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[R23:%.*]] = lshr i16 [[TMP55]], [[TMP56]] +; SSE-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[R24:%.*]] = lshr i16 [[TMP57]], [[TMP58]] +; SSE-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[R25:%.*]] = lshr i16 [[TMP59]], [[TMP60]] +; SSE-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[R26:%.*]] = lshr i16 [[TMP61]], [[TMP62]] +; SSE-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[R27:%.*]] = lshr i16 [[TMP63]], [[TMP64]] +; SSE-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[R28:%.*]] = lshr i16 [[TMP65]], [[TMP66]] +; SSE-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[R29:%.*]] = lshr i16 [[TMP67]], [[TMP68]] +; SSE-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[R30:%.*]] = lshr i16 [[TMP69]], [[TMP70]] +; SSE-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[R31:%.*]] = lshr i16 [[TMP71]], [[TMP72]] +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0 +; SSE-NEXT: 
[[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3 +; SSE-NEXT: [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5 +; SSE-NEXT: [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1 +; SSE-NEXT: [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3 +; SSE-NEXT: [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1 +; SSE-NEXT: [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3 +; SSE-NEXT: [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5 +; SSE-NEXT: [[TMP95:%.*]] 
= insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1 +; SSE-NEXT: [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3 +; SSE-NEXT: [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( Index: llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -241,134 +241,146 @@ define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( -; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 -; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 -; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 -; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 -; SSE-NEXT: [[A4:%.*]] = load 
i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 -; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 -; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 -; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 -; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 -; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 -; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 -; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 -; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 -; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 -; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 -; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 -; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 -; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 -; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 -; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 -; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 -; 
SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 -; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 -; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 -; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 -; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 -; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 -; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 -; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 -; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 -; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 -; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 -; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 -; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 -; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] 
-; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: 
store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; SSE-NEXT: [[R0:%.*]] = shl i16 [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; SSE-NEXT: [[R1:%.*]] = shl i16 
[[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP13:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; SSE-NEXT: [[R2:%.*]] = shl i16 [[TMP13]], [[TMP14]] +; SSE-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; SSE-NEXT: [[R3:%.*]] = shl i16 [[TMP15]], [[TMP16]] +; SSE-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; SSE-NEXT: [[R4:%.*]] = shl i16 [[TMP17]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SSE-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; SSE-NEXT: [[R5:%.*]] = shl i16 [[TMP19]], [[TMP20]] +; SSE-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SSE-NEXT: [[TMP22:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; SSE-NEXT: [[R6:%.*]] = shl i16 [[TMP21]], [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SSE-NEXT: [[TMP24:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; SSE-NEXT: [[R7:%.*]] = shl i16 [[TMP23]], [[TMP24]] +; SSE-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; SSE-NEXT: [[R8:%.*]] = shl i16 [[TMP25]], [[TMP26]] +; SSE-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; SSE-NEXT: [[R9:%.*]] = shl i16 [[TMP27]], [[TMP28]] +; SSE-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; SSE-NEXT: [[R10:%.*]] = shl i16 [[TMP29]], [[TMP30]] +; SSE-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; SSE-NEXT: [[R11:%.*]] = shl i16 [[TMP31]], [[TMP32]] +; SSE-NEXT: [[TMP33:%.*]] 
= extractelement <8 x i16> [[TMP2]], i32 4 +; SSE-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; SSE-NEXT: [[R12:%.*]] = shl i16 [[TMP33]], [[TMP34]] +; SSE-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SSE-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; SSE-NEXT: [[R13:%.*]] = shl i16 [[TMP35]], [[TMP36]] +; SSE-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; SSE-NEXT: [[R14:%.*]] = shl i16 [[TMP37]], [[TMP38]] +; SSE-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SSE-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; SSE-NEXT: [[R15:%.*]] = shl i16 [[TMP39]], [[TMP40]] +; SSE-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; SSE-NEXT: [[R16:%.*]] = shl i16 [[TMP41]], [[TMP42]] +; SSE-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; SSE-NEXT: [[R17:%.*]] = shl i16 [[TMP43]], [[TMP44]] +; SSE-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; SSE-NEXT: [[R18:%.*]] = shl i16 [[TMP45]], [[TMP46]] +; SSE-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; SSE-NEXT: [[R19:%.*]] = shl i16 [[TMP47]], [[TMP48]] +; SSE-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; SSE-NEXT: [[R20:%.*]] = shl i16 [[TMP49]], [[TMP50]] +; SSE-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; SSE-NEXT: [[R21:%.*]] = shl i16 [[TMP51]], [[TMP52]] +; SSE-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP3]], 
i32 6 +; SSE-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; SSE-NEXT: [[R22:%.*]] = shl i16 [[TMP53]], [[TMP54]] +; SSE-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; SSE-NEXT: [[R23:%.*]] = shl i16 [[TMP55]], [[TMP56]] +; SSE-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP8]], i32 0 +; SSE-NEXT: [[R24:%.*]] = shl i16 [[TMP57]], [[TMP58]] +; SSE-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP8]], i32 1 +; SSE-NEXT: [[R25:%.*]] = shl i16 [[TMP59]], [[TMP60]] +; SSE-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP8]], i32 2 +; SSE-NEXT: [[R26:%.*]] = shl i16 [[TMP61]], [[TMP62]] +; SSE-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP8]], i32 3 +; SSE-NEXT: [[R27:%.*]] = shl i16 [[TMP63]], [[TMP64]] +; SSE-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; SSE-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP8]], i32 4 +; SSE-NEXT: [[R28:%.*]] = shl i16 [[TMP65]], [[TMP66]] +; SSE-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; SSE-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP8]], i32 5 +; SSE-NEXT: [[R29:%.*]] = shl i16 [[TMP67]], [[TMP68]] +; SSE-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; SSE-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +; SSE-NEXT: [[R30:%.*]] = shl i16 [[TMP69]], [[TMP70]] +; SSE-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; SSE-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +; SSE-NEXT: [[R31:%.*]] = shl i16 [[TMP71]], [[TMP72]] +; SSE-NEXT: [[TMP73:%.*]] = insertelement <8 x i16> poison, i16 [[R0]], i32 0 +; SSE-NEXT: 
[[TMP74:%.*]] = insertelement <8 x i16> [[TMP73]], i16 [[R1]], i32 1 +; SSE-NEXT: [[TMP75:%.*]] = insertelement <8 x i16> [[TMP74]], i16 [[R2]], i32 2 +; SSE-NEXT: [[TMP76:%.*]] = insertelement <8 x i16> [[TMP75]], i16 [[R3]], i32 3 +; SSE-NEXT: [[TMP77:%.*]] = insertelement <8 x i16> [[TMP76]], i16 [[R4]], i32 4 +; SSE-NEXT: [[TMP78:%.*]] = insertelement <8 x i16> [[TMP77]], i16 [[R5]], i32 5 +; SSE-NEXT: [[TMP79:%.*]] = insertelement <8 x i16> [[TMP78]], i16 [[R6]], i32 6 +; SSE-NEXT: [[TMP80:%.*]] = insertelement <8 x i16> [[TMP79]], i16 [[R7]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP80]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP81:%.*]] = insertelement <8 x i16> poison, i16 [[R8]], i32 0 +; SSE-NEXT: [[TMP82:%.*]] = insertelement <8 x i16> [[TMP81]], i16 [[R9]], i32 1 +; SSE-NEXT: [[TMP83:%.*]] = insertelement <8 x i16> [[TMP82]], i16 [[R10]], i32 2 +; SSE-NEXT: [[TMP84:%.*]] = insertelement <8 x i16> [[TMP83]], i16 [[R11]], i32 3 +; SSE-NEXT: [[TMP85:%.*]] = insertelement <8 x i16> [[TMP84]], i16 [[R12]], i32 4 +; SSE-NEXT: [[TMP86:%.*]] = insertelement <8 x i16> [[TMP85]], i16 [[R13]], i32 5 +; SSE-NEXT: [[TMP87:%.*]] = insertelement <8 x i16> [[TMP86]], i16 [[R14]], i32 6 +; SSE-NEXT: [[TMP88:%.*]] = insertelement <8 x i16> [[TMP87]], i16 [[R15]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP88]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP89:%.*]] = insertelement <8 x i16> poison, i16 [[R16]], i32 0 +; SSE-NEXT: [[TMP90:%.*]] = insertelement <8 x i16> [[TMP89]], i16 [[R17]], i32 1 +; SSE-NEXT: [[TMP91:%.*]] = insertelement <8 x i16> [[TMP90]], i16 [[R18]], i32 2 +; SSE-NEXT: [[TMP92:%.*]] = insertelement <8 x i16> [[TMP91]], i16 [[R19]], i32 3 +; SSE-NEXT: [[TMP93:%.*]] = insertelement <8 x i16> [[TMP92]], i16 [[R20]], i32 4 +; SSE-NEXT: [[TMP94:%.*]] = insertelement <8 x i16> [[TMP93]], i16 [[R21]], i32 5 +; SSE-NEXT: [[TMP95:%.*]] 
= insertelement <8 x i16> [[TMP94]], i16 [[R22]], i32 6 +; SSE-NEXT: [[TMP96:%.*]] = insertelement <8 x i16> [[TMP95]], i16 [[R23]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP96]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP97:%.*]] = insertelement <8 x i16> poison, i16 [[R24]], i32 0 +; SSE-NEXT: [[TMP98:%.*]] = insertelement <8 x i16> [[TMP97]], i16 [[R25]], i32 1 +; SSE-NEXT: [[TMP99:%.*]] = insertelement <8 x i16> [[TMP98]], i16 [[R26]], i32 2 +; SSE-NEXT: [[TMP100:%.*]] = insertelement <8 x i16> [[TMP99]], i16 [[R27]], i32 3 +; SSE-NEXT: [[TMP101:%.*]] = insertelement <8 x i16> [[TMP100]], i16 [[R28]], i32 4 +; SSE-NEXT: [[TMP102:%.*]] = insertelement <8 x i16> [[TMP101]], i16 [[R29]], i32 5 +; SSE-NEXT: [[TMP103:%.*]] = insertelement <8 x i16> [[TMP102]], i16 [[R30]], i32 6 +; SSE-NEXT: [[TMP104:%.*]] = insertelement <8 x i16> [[TMP103]], i16 [[R31]], i32 7 +; SSE-NEXT: store <8 x i16> [[TMP104]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( Index: llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -70,9 +70,10 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]] -; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 -; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x 
double> [[TMP1]], double [[MUL5]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[C:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %i0 = load volatile double, double* %a, align 8 Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -702,33 +702,6 @@ ; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to 
float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @sitofp_8i64_8f32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> @@ -917,18 +890,20 @@ define void @sitofp_4i16_4f32() #0 { ; SSE-LABEL: @sitofp_4i16_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 
0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP2]] to float +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f32( @@ -954,30 
+929,34 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 
-; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement 
<4 x float> [[TMP12]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i16_8f32( @@ -1015,54 +994,62 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), 
align 16 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float -; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float -; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float -; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float -; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float -; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float -; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float -; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* 
@dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 -; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 -; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 -; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 -; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP11]] to float +; SSE-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP12]] to float +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[TMP13]] to float +; SSE-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[TMP14]] to float +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[TMP15]] to float +; SSE-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[TMP16]] to float +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; 
SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[TMP17]] to float +; SSE-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[TMP18]] to float +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[TMP19]] to float +; SSE-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[TMP20]] to float +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x 
float> poison, float [[CVT12]], i32 0 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -702,33 +702,6 @@ ; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @sitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; 
AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; ; AVX512-LABEL: @sitofp_8i64_8f32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 ; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float> @@ -917,18 +890,20 @@ define void @sitofp_4i16_4f32() #0 { ; SSE-LABEL: @sitofp_4i16_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 
0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP2]] to float +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2 +; SSE-NEXT: 
[[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_4i16_4f32( @@ -954,30 +929,34 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 
-; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] 
= sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sitofp_8i16_8f32( @@ -1015,54 +994,62 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, 
i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float -; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float -; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float -; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float -; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float -; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float -; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float -; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float -; SSE-NEXT: [[CVT15:%.*]] = 
sitofp i16 [[LD15]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 -; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 -; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 -; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 -; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], 
[16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[TMP11]] to float +; SSE-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[TMP12]] to float +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[TMP13]] to float +; SSE-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[TMP14]] to float +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; SSE-NEXT: 
[[CVT10:%.*]] = sitofp i16 [[TMP15]] to float +; SSE-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[TMP16]] to float +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[TMP17]] to float +; SSE-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[TMP18]] to float +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[TMP19]] to float +; SSE-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[TMP20]] to float +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> 
[[TMP31]], float [[CVT11]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = 
fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -119,11 +119,13 @@ ; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[SRC_ADDR_013]], align 8 -; CHECK-NEXT: store double [[TMP0]], double* [[DST_ADDR_014]], align 8 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1 -; CHECK-NEXT: store double [[TMP1]], double* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; 
CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]] ; CHECK-NEXT: [[ADD_PTR4]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 [[I_015]] ; CHECK-NEXT: [[INC]] = add i64 [[I_015]], 1 @@ -166,19 +168,21 @@ ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 -; CHECK-NEXT: store float [[TMP0]], float* [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 -; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 -; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 -; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> 
[[TMP5]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 Index: llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ_I7 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ_BDVER1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | 
FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ_AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ @@ -135,14 +135,32 @@ ; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @uitofp_2i32_2f64( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: ret void +; AVX256NODQ_I7-LABEL: @uitofp_2i32_2f64( +; AVX256NODQ_I7-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 +; AVX256NODQ_I7-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double +; AVX256NODQ_I7-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double +; AVX256NODQ_I7-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; 
AVX256NODQ_I7-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ_I7-NEXT: ret void +; +; AVX256NODQ_BDVER1-LABEL: @uitofp_2i32_2f64( +; AVX256NODQ_BDVER1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 +; AVX256NODQ_BDVER1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 +; AVX256NODQ_BDVER1-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double +; AVX256NODQ_BDVER1-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double +; AVX256NODQ_BDVER1-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ_BDVER1-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ_BDVER1-NEXT: ret void +; +; AVX256NODQ_AVX2-LABEL: @uitofp_2i32_2f64( +; AVX256NODQ_AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 +; AVX256NODQ_AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 +; AVX256NODQ_AVX2-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double +; AVX256NODQ_AVX2-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double +; AVX256NODQ_AVX2-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ_AVX2-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ_AVX2-NEXT: ret void ; ; AVX512-LABEL: @uitofp_2i32_2f64( ; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 @@ -156,6 +174,14 @@ ; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 
; AVX256DQ-NEXT: ret void ; +; AVX256NODQ-LABEL: @uitofp_2i32_2f64( +; AVX256NODQ-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double +; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double +; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: ret void %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64 %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4 %cvt0 = uitofp i32 %ld0 to double @@ -408,14 +434,32 @@ ; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @uitofp_2i8_2f64( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double -; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: ret void +; AVX256NODQ_I7-LABEL: @uitofp_2i8_2f64( +; AVX256NODQ_I7-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 
x i8]* @src8, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX256NODQ_I7-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double +; AVX256NODQ_I7-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double +; AVX256NODQ_I7-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ_I7-NEXT: ret void +; +; AVX256NODQ_BDVER1-LABEL: @uitofp_2i8_2f64( +; AVX256NODQ_BDVER1-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX256NODQ_BDVER1-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX256NODQ_BDVER1-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double +; AVX256NODQ_BDVER1-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double +; AVX256NODQ_BDVER1-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ_BDVER1-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ_BDVER1-NEXT: ret void +; +; AVX256NODQ_AVX2-LABEL: @uitofp_2i8_2f64( +; AVX256NODQ_AVX2-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX256NODQ_AVX2-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX256NODQ_AVX2-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double +; AVX256NODQ_AVX2-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double +; AVX256NODQ_AVX2-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ_AVX2-NEXT: store double [[CVT1]], 
double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ_AVX2-NEXT: ret void ; ; AVX512-LABEL: @uitofp_2i8_2f64( ; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 @@ -429,6 +473,14 @@ ; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 ; AVX256DQ-NEXT: ret void ; +; AVX256NODQ-LABEL: @uitofp_2i8_2f64( +; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double +; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double +; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: ret void %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64 %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1 %cvt0 = uitofp i8 %ld0 to double @@ -584,6 +636,63 @@ ; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 ; SSE-NEXT: ret void ; +; AVX256NODQ_I7-LABEL: @uitofp_4i64_4f32( +; AVX256NODQ_I7-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256NODQ_I7-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; AVX256NODQ_I7-NEXT: 
[[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; AVX256NODQ_I7-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; AVX256NODQ_I7-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; AVX256NODQ_I7-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float +; AVX256NODQ_I7-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float +; AVX256NODQ_I7-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256NODQ_I7-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; AVX256NODQ_I7-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ_I7-NEXT: ret void +; +; AVX256NODQ_BDVER1-LABEL: @uitofp_4i64_4f32( +; AVX256NODQ_BDVER1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256NODQ_BDVER1-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256NODQ_BDVER1-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; AVX256NODQ_BDVER1-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; AVX256NODQ_BDVER1-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; AVX256NODQ_BDVER1-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; AVX256NODQ_BDVER1-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float +; AVX256NODQ_BDVER1-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float +; AVX256NODQ_BDVER1-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256NODQ_BDVER1-NEXT: store float [[CVT1]], 
float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256NODQ_BDVER1-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; AVX256NODQ_BDVER1-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ_BDVER1-NEXT: ret void +; +; AVX256NODQ_AVX2-LABEL: @uitofp_4i64_4f32( +; AVX256NODQ_AVX2-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256NODQ_AVX2-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256NODQ_AVX2-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; AVX256NODQ_AVX2-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; AVX256NODQ_AVX2-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; AVX256NODQ_AVX2-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; AVX256NODQ_AVX2-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float +; AVX256NODQ_AVX2-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float +; AVX256NODQ_AVX2-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256NODQ_AVX2-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256NODQ_AVX2-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; AVX256NODQ_AVX2-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ_AVX2-NEXT: ret void +; +; AVX512-LABEL: @uitofp_4i64_4f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX512-NEXT: 
[[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256DQ-LABEL: @uitofp_4i64_4f32( +; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; AVX256DQ-NEXT: ret void +; ; AVX256NODQ-LABEL: @uitofp_4i64_4f32( ; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 ; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 @@ -598,19 +707,6 @@ ; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 ; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 ; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @uitofp_4i64_4f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @uitofp_4i64_4f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> -; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; AVX256DQ-NEXT: ret void -; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 
64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 @@ -654,32 +750,76 @@ ; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @uitofp_8i64_8f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 -; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 -; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 -; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float -; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float -; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float -; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float -; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float -; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void +; AVX256NODQ_I7-LABEL: @uitofp_8i64_8f32( +; AVX256NODQ_I7-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256NODQ_I7-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16 +; AVX256NODQ_I7-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8 +; AVX256NODQ_I7-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32 +; AVX256NODQ_I7-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8 +; AVX256NODQ_I7-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16 +; AVX256NODQ_I7-NEXT: [[LD7:%.*]] 
= load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8 +; AVX256NODQ_I7-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; AVX256NODQ_I7-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; AVX256NODQ_I7-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float +; AVX256NODQ_I7-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float +; AVX256NODQ_I7-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float +; AVX256NODQ_I7-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float +; AVX256NODQ_I7-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float +; AVX256NODQ_I7-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float +; AVX256NODQ_I7-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256NODQ_I7-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256NODQ_I7-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 +; AVX256NODQ_I7-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; AVX256NODQ_I7-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 +; AVX256NODQ_I7-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 +; AVX256NODQ_I7-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 +; AVX256NODQ_I7-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; AVX256NODQ_I7-NEXT: ret void +; +; AVX256NODQ_BDVER1-LABEL: @uitofp_8i64_8f32( +; AVX256NODQ_BDVER1-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX256NODQ_BDVER1-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float> +; 
AVX256NODQ_BDVER1-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7 +; AVX256NODQ_BDVER1-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0 +; AVX256NODQ_BDVER1-NEXT: [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0 +; AVX256NODQ_BDVER1-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1 +; AVX256NODQ_BDVER1-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1 +; AVX256NODQ_BDVER1-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2 +; AVX256NODQ_BDVER1-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2 +; AVX256NODQ_BDVER1-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3 +; AVX256NODQ_BDVER1-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3 +; AVX256NODQ_BDVER1-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4 +; AVX256NODQ_BDVER1-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4 +; AVX256NODQ_BDVER1-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; AVX256NODQ_BDVER1-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5 +; AVX256NODQ_BDVER1-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; AVX256NODQ_BDVER1-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6 +; AVX256NODQ_BDVER1-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7 +; AVX256NODQ_BDVER1-NEXT: store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256NODQ_BDVER1-NEXT: ret void +; +; AVX256NODQ_AVX2-LABEL: @uitofp_8i64_8f32( +; AVX256NODQ_AVX2-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 +; AVX256NODQ_AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float> +; AVX256NODQ_AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 7 +; 
AVX256NODQ_AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 0 +; AVX256NODQ_AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x float> poison, float [[TMP4]], i32 0 +; AVX256NODQ_AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP2]], i32 1 +; AVX256NODQ_AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i32 1 +; AVX256NODQ_AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP2]], i32 2 +; AVX256NODQ_AVX2-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i32 2 +; AVX256NODQ_AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3 +; AVX256NODQ_AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 3 +; AVX256NODQ_AVX2-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[TMP2]], i32 4 +; AVX256NODQ_AVX2-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 4 +; AVX256NODQ_AVX2-NEXT: [[TMP14:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; AVX256NODQ_AVX2-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i32 5 +; AVX256NODQ_AVX2-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; AVX256NODQ_AVX2-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP16]], i32 6 +; AVX256NODQ_AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP3]], i32 7 +; AVX256NODQ_AVX2-NEXT: store <8 x float> [[TMP18]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256NODQ_AVX2-NEXT: ret void ; ; AVX512-LABEL: @uitofp_8i64_8f32( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64 @@ -869,18 +1009,20 @@ define void @uitofp_4i16_4f32() #0 { ; SSE-LABEL: @uitofp_4i16_4f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, 
i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[TMP2]] to float +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast 
([16 x float]* @dst32 to <4 x float>*), align 64 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_4i16_4f32( @@ -906,30 +1048,34 @@ define void @uitofp_8i16_8f32() #0 { ; SSE-LABEL: @uitofp_8i16_8f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; 
SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[TMP3]] to float +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[TMP4]] to float +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; 
SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP18]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @uitofp_8i16_8f32( @@ -967,54 +1113,62 @@ define void @uitofp_16i16_16f32() #0 { ; SSE-LABEL: @uitofp_16i16_16f32( -; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2 -; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2 -; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8 -; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2 -; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4 -; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, 
i64 7), align 2 -; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16 -; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2 -; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4 -; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2 -; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8 -; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2 -; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4 -; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float -; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float -; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float -; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float -; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float -; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float -; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float -; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[LD8]] to float -; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[LD9]] to float -; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[LD10]] to float -; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[LD11]] to float -; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[LD12]] to float -; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[LD13]] to float -; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[LD14]] to float -; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[LD15]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* 
@dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8 -; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16 -; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8 -; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 -; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32 -; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 -; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8 -; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 -; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16 -; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 -; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8 -; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* 
@src16 to <4 x i16>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[TMP5]] to float +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1 +; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[TMP6]] to float +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[TMP7]] to float +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[TMP8]] to float +; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[TMP9]] to float +; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[TMP10]] to float +; SSE-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[TMP11]] to float +; SSE-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[TMP12]] to float +; SSE-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[TMP13]] to float +; SSE-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[TMP14]] to float +; SSE-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[TMP15]] to float +; SSE-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; 
SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[TMP16]] to float +; SSE-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[TMP17]] to float +; SSE-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[TMP18]] to float +; SSE-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[TMP19]] to float +; SSE-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[TMP20]] to float +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[CVT1]], i32 1 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[CVT2]], i32 2 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[CVT3]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP24]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> poison, float [[CVT4]], i32 0 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[CVT5]], i32 1 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[CVT6]], i32 2 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float [[CVT7]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP28]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> poison, float [[CVT8]], i32 0 +; SSE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[CVT9]], i32 1 +; SSE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[CVT10]], i32 2 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float [[CVT11]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP32]], <4 x float>* bitcast (float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[CVT12]], i32 0 +; SSE-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[CVT13]], i32 1 +; SSE-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[CVT14]], i32 2 +; SSE-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[CVT15]], i32 3 +; SSE-NEXT: store <4 x float> [[TMP36]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i16_16f32( Index: llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll +++ llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll @@ -27,13 +27,15 @@ ; CHECK-NEXT: [[T2_0:%.*]] = phi i32 [ [[T6]], [[BB1]] ], [ 2, [[ENTRY]] ] ; CHECK-NEXT: [[T3_0:%.*]] = phi i32 [ [[T8]], [[BB1]] ], [ 2, [[ENTRY]] ] ; CHECK-NEXT: [[T4_0:%.*]] = phi i32 [ [[T10]], [[BB1]] ], [ 2, [[ENTRY]] ] -; CHECK-NEXT: store i32 [[T1_0]], i32* [[X]], align 4 ; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 -; CHECK-NEXT: store i32 [[T2_0]], i32* [[T12]], align 4 ; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2 -; CHECK-NEXT: store i32 [[T3_0]], i32* [[T13]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3 -; CHECK-NEXT: store i32 [[T4_0]], i32* [[T14]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[T1_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[T2_0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T3_0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T4_0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[X]] to <4 x i32>* 
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret void ; entry: