diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2941,6 +2941,12 @@
     return ScalarToTreeEntry.lookup(V);
   }

+  /// Checks if the specified list of the instructions/values can be vectorized
+  /// and fills required data before actual scheduling of the instructions.
+  TreeEntry::EntryState getScalarsVectorizationState(
+      InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;

@@ -5171,6 +5177,309 @@
                                       const Instruction *AltOp,
                                       const TargetLibraryInfo &TLI);

+BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
+    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
+
+  unsigned ShuffleOrOp =
+      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
+  auto *VL0 = cast<Instruction>(S.OpValue);
+  switch (ShuffleOrOp) {
+  case Instruction::PHI: {
+    // Check for terminator values (e.g. invoke).
+    for (Value *V : VL)
+      for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
+        Instruction *Term = dyn_cast<Instruction>(Incoming);
+        if (Term && Term->isTerminator()) {
+          LLVM_DEBUG(dbgs()
+                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
+          return TreeEntry::NeedToGather;
+        }
+      }
+
+    return TreeEntry::Vectorize;
+  }
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement: {
+    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    if (Reuse || !CurrentOrder.empty())
+      return TreeEntry::Vectorize;
+    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+    return TreeEntry::NeedToGather;
+  }
+  case Instruction::InsertElement: {
+    // Check that we have a buildvector and not a shuffle of 2 or more
+    // different vectors.
+    ValueSet SourceVectors;
+    for (Value *V : VL) {
+      SourceVectors.insert(cast<InsertElementInst>(V)->getOperand(0));
+      assert(getInsertIndex(V) != std::nullopt &&
+             "Non-constant or undef index?");
+    }
+
+    if (count_if(VL, [&SourceVectors](Value *V) {
+          return !SourceVectors.contains(V);
+        }) >= 2) {
+      // Found 2nd source vector - cancel.
+      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+                           "different source vectors.\n");
+      return TreeEntry::NeedToGather;
+    }
+
+    return TreeEntry::Vectorize;
+  }
+  case Instruction::Load: {
+    // Check that a vectorized load would load the same memory as a scalar
+    // load. For example, we don't want to vectorize loads that are smaller
+    // than 8-bit. Even though we have a packed struct {<i2, i2>} LLVM
+    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+    // from such a struct, we read/write packed bits disagreeing with the
+    // unvectorized version.
+    switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
+                              PointerOps)) {
+    case LoadsState::Vectorize:
+      return TreeEntry::Vectorize;
+    case LoadsState::ScatterVectorize:
+      return TreeEntry::ScatterVectorize;
+    case LoadsState::Gather:
+#ifndef NDEBUG
+      Type *ScalarTy = VL0->getType();
+      if (DL->getTypeSizeInBits(ScalarTy) !=
+          DL->getTypeAllocSizeInBits(ScalarTy))
+        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+      else if (any_of(VL,
+                      [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
+        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+      else
+        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+#endif // NDEBUG
+      return TreeEntry::NeedToGather;
+    }
+    llvm_unreachable("Unexpected state of loads");
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    for (Value *V : VL) {
+      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+      if (Ty != SrcTy || !isValidElementType(Ty)) {
+        LLVM_DEBUG(
+            dbgs() << "SLP: Gathering casts with different src types.\n");
+        return TreeEntry::NeedToGather;
+      }
+    }
+    return TreeEntry::Vectorize;
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    // Check that all of the compares have the same predicate.
+    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+    Type *ComparedTy = VL0->getOperand(0)->getType();
+    for (Value *V : VL) {
+      CmpInst *Cmp = cast<CmpInst>(V);
+      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+          Cmp->getOperand(0)->getType() != ComparedTy) {
+        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+        return TreeEntry::NeedToGather;
+      }
+    }
+    return TreeEntry::Vectorize;
+  }
+  case Instruction::Select:
+  case Instruction::FNeg:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return TreeEntry::Vectorize;
+  case Instruction::GetElementPtr: {
+    // We don't combine GEPs with complicated (nested) indexing.
+    for (Value *V : VL) {
+      auto *I = dyn_cast<GetElementPtrInst>(V);
+      if (!I)
+        continue;
+      if (I->getNumOperands() != 2) {
+        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+        return TreeEntry::NeedToGather;
+      }
+    }
+
+    // We can't combine several GEPs into one vector if they operate on
+    // different types.
+    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
+    for (Value *V : VL) {
+      auto *GEP = dyn_cast<GEPOperator>(V);
+      if (!GEP)
+        continue;
+      Type *CurTy = GEP->getSourceElementType();
+      if (Ty0 != CurTy) {
+        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+        return TreeEntry::NeedToGather;
+      }
+    }
+
+    // We don't combine GEPs with non-constant indexes.
+    Type *Ty1 = VL0->getOperand(1)->getType();
+    for (Value *V : VL) {
+      auto *I = dyn_cast<GetElementPtrInst>(V);
+      if (!I)
+        continue;
+      auto *Op = I->getOperand(1);
+      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+          (Op->getType() != Ty1 &&
+           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
+            Op->getType()->getScalarSizeInBits() >
+                DL->getIndexSizeInBits(
+                    V->getType()->getPointerAddressSpace())))) {
+        LLVM_DEBUG(
+            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+        return TreeEntry::NeedToGather;
+      }
+    }
+
+    return TreeEntry::Vectorize;
+  }
+  case Instruction::Store: {
+    // Check if the stores are consecutive or if we need to swizzle them.
+    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+    // Avoid types that are padded when being allocated as scalars, while
+    // being packed together in a vector (such as i1).
+    if (DL->getTypeSizeInBits(ScalarTy) !=
+        DL->getTypeAllocSizeInBits(ScalarTy)) {
+      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
+      return TreeEntry::NeedToGather;
+    }
+    // Make sure all stores in the bundle are simple - we can't vectorize
+    // atomic or volatile stores.
+    for (Value *V : VL) {
+      auto *SI = cast<StoreInst>(V);
+      if (!SI->isSimple()) {
+        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+        return TreeEntry::NeedToGather;
+      }
+      PointerOps.push_back(SI->getPointerOperand());
+    }
+
+    // Check the order of pointer operands.
+    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
+      Value *Ptr0;
+      Value *PtrN;
+      if (CurrentOrder.empty()) {
+        Ptr0 = PointerOps.front();
+        PtrN = PointerOps.back();
+      } else {
+        Ptr0 = PointerOps[CurrentOrder.front()];
+        PtrN = PointerOps[CurrentOrder.back()];
+      }
+      std::optional<int> Dist =
+          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
+      // Check that the sorted pointer operands are consecutive.
+      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
+        return TreeEntry::Vectorize;
+    }
+
+    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+    return TreeEntry::NeedToGather;
+  }
+  case Instruction::Call: {
+    // Check if the calls are all to the same vectorizable intrinsic or
+    // library function.
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    VFShape Shape = VFShape::get(
+        *CI, ElementCount::getFixed(static_cast<unsigned>(VL.size())),
+        false /*HasGlobalPred*/);
+    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+    if (!VecFunc && !isTriviallyVectorizable(ID)) {
+      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+      return TreeEntry::NeedToGather;
+    }
+    Function *F = CI->getCalledFunction();
+    unsigned NumArgs = CI->arg_size();
+    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
+    for (unsigned J = 0; J != NumArgs; ++J)
+      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
+        ScalarArgs[J] = CI->getArgOperand(J);
+    for (Value *V : VL) {
+      CallInst *CI2 = dyn_cast<CallInst>(V);
+      if (!CI2 || CI2->getCalledFunction() != F ||
+          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+          (VecFunc &&
+           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
+          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+                          << "\n");
+        return TreeEntry::NeedToGather;
+      }
+      // Some intrinsics have scalar arguments and should be same in order for
+      // them to be vectorized.
+      for (unsigned J = 0; J != NumArgs; ++J) {
+        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
+          Value *A1J = CI2->getArgOperand(J);
+          if (ScalarArgs[J] != A1J) {
+            LLVM_DEBUG(dbgs()
+                       << "SLP: mismatched arguments in call:" << *CI
+                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
+            return TreeEntry::NeedToGather;
+          }
+        }
+      }
+      // Verify that the bundle operands are identical between the two calls.
+      if (CI->hasOperandBundles() &&
+          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
+                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
+                          << "!=" << *V << '\n');
+        return TreeEntry::NeedToGather;
+      }
+    }
+
+    return TreeEntry::Vectorize;
+  }
+  case Instruction::ShuffleVector: {
+    // If this is not an alternate sequence of opcode like add-sub
+    // then do not vectorize this instruction.
+    if (!S.isAltShuffle()) {
+      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+      return TreeEntry::NeedToGather;
+    }
+    return TreeEntry::Vectorize;
+  }
+  default:
+    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+    return TreeEntry::NeedToGather;
+  }
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -5452,6 +5761,17 @@
   if (!TryToFindDuplicates(S))
     return;

+  // Perform specific checks for each particular instruction kind.
+  OrdersType CurrentOrder;
+  SmallVector<Value *> PointerOps;
+  TreeEntry::EntryState State = getScalarsVectorizationState(
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+  if (State == TreeEntry::NeedToGather) {
+    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+                 ReuseShuffleIndicies);
+    return;
+  }
+
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
@@ -5480,20 +5800,6 @@
     case Instruction::PHI: {
       auto *PH = cast<PHINode>(VL0);

-      // Check for terminator values (e.g. invoke).
-      for (Value *V : VL)
-        for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
-          Instruction *Term = dyn_cast<Instruction>(Incoming);
-          if (Term && Term->isTerminator()) {
-            LLVM_DEBUG(dbgs()
-                       << "SLP: Need to swizzle PHINodes (terminator use).\n");
-            BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                         ReuseShuffleIndicies);
-            return;
-          }
-        }
-
       TreeEntry *TE =
           newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
@@ -5521,9 +5827,7 @@
     }
     case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      OrdersType CurrentOrder;
-      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
-      if (Reuse) {
+      if (CurrentOrder.empty()) {
         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies);
@@ -5534,55 +5838,28 @@
         VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
-      if (!CurrentOrder.empty()) {
-        LLVM_DEBUG({
-          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
-                    "with order";
-          for (unsigned Idx : CurrentOrder)
-            dbgs() << " " << Idx;
-          dbgs() << "\n";
-        });
-        fixupOrderingIndices(CurrentOrder);
-        // Insert new order with initial value 0, if it does not exist,
-        // otherwise return the iterator to the existing one.
-        newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies, CurrentOrder);
-        // This is a special case, as it does not gather, but at the same time
-        // we are not extending buildTree_rec() towards the operands.
-        ValueList Op0;
-        Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
-        return;
-      }
-      LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
-      BS.cancelScheduling(VL, VL0);
+      LLVM_DEBUG({
+        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+                  "with order";
+        for (unsigned Idx : CurrentOrder)
+          dbgs() << " " << Idx;
+        dbgs() << "\n";
+      });
+      fixupOrderingIndices(CurrentOrder);
+      // Insert new order with initial value 0, if it does not exist,
+      // otherwise return the iterator to the existing one.
+      newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies, CurrentOrder);
+      // This is a special case, as it does not gather, but at the same time
+      // we are not extending buildTree_rec() towards the operands.
+      ValueList Op0;
+      Op0.assign(VL.size(), VL0->getOperand(0));
+      VectorizableTree.back()->setOperand(0, Op0);
       return;
     }
     case Instruction::InsertElement: {
       assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");

-      // Check that we have a buildvector and not a shuffle of 2 or more
-      // different vectors.
-      ValueSet SourceVectors;
-      for (Value *V : VL) {
-        SourceVectors.insert(cast<InsertElementInst>(V)->getOperand(0));
-        assert(getInsertIndex(V) != std::nullopt &&
-               "Non-constant or undef index?");
-      }
-
-      if (count_if(VL, [&SourceVectors](Value *V) {
-            return !SourceVectors.contains(V);
-          }) >= 2) {
-        // Found 2nd source vector - cancel.
-        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
-                             "different source vectors.\n");
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
-        BS.cancelScheduling(VL, VL0);
-        return;
-      }
-
       auto OrdCompare = [](const std::pair<int, int> &P1,
                            const std::pair<int, int> &P2) {
         return P1.first > P2.first;
       };
@@ -5625,12 +5902,9 @@
       // treats loading/storing it as an i8 struct. If we vectorize loads/stores
      // from such a struct, we read/write packed bits disagreeing with the
      // unvectorized version.
-      SmallVector<Value *> PointerOps;
-      OrdersType CurrentOrder;
       TreeEntry *TE = nullptr;
-      switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
-                                CurrentOrder, PointerOps)) {
-      case LoadsState::Vectorize:
+      switch (State) {
+      case TreeEntry::Vectorize:
         if (CurrentOrder.empty()) {
           // Original loads are consecutive and does not require reordering.
           TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -5645,7 +5919,7 @@
         }
         TE->setOperandsInOrder();
         break;
-      case LoadsState::ScatterVectorize:
+      case TreeEntry::ScatterVectorize:
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndicies);
@@ -5653,23 +5927,8 @@
         buildTree_rec(PointerOps, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
         break;
-      case LoadsState::Gather:
-        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
-#ifndef NDEBUG
-        Type *ScalarTy = VL0->getType();
-        if (DL->getTypeSizeInBits(ScalarTy) !=
-            DL->getTypeAllocSizeInBits(ScalarTy))
-          LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
-        else if (any_of(VL, [](Value *V) {
-                   return !cast<LoadInst>(V)->isSimple();
-                 }))
-          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
-        else
-          LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
-#endif // NDEBUG
-        break;
+      case TreeEntry::NeedToGather:
+        llvm_unreachable("Unexpected loads state.");
       }
       return;
     }
@@ -5685,18 +5944,6 @@
     case Instruction::Trunc:
     case Instruction::FPTrunc:
    case Instruction::BitCast: {
-      Type *SrcTy = VL0->getOperand(0)->getType();
-      for (Value *V : VL) {
-        Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
-        if (Ty != SrcTy || !isValidElementType(Ty)) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs()
-                     << "SLP: Gathering casts with different src types.\n");
-          return;
-        }
-      }
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
@@ -5717,19 +5964,6 @@
       // Check that all of the compares have the same predicate.
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
       CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
-      Type *ComparedTy = VL0->getOperand(0)->getType();
-      for (Value *V : VL) {
-        CmpInst *Cmp = cast<CmpInst>(V);
-        if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
-            Cmp->getOperand(0)->getType() != ComparedTy) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs()
-                     << "SLP: Gathering cmp with different predicate.\n");
-          return;
-        }
-      }
       TreeEntry *TE =
           newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndicies);
@@ -5807,60 +6041,6 @@
       return;
     }
     case Instruction::GetElementPtr: {
-      // We don't combine GEPs with complicated (nested) indexing.
-      for (Value *V : VL) {
-        auto *I = dyn_cast<GetElementPtrInst>(V);
-        if (!I)
-          continue;
-        if (I->getNumOperands() != 2) {
-          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          return;
-        }
-      }
-
-      // We can't combine several GEPs into one vector if they operate on
-      // different types.
-      Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
-      for (Value *V : VL) {
-        auto *GEP = dyn_cast<GEPOperator>(V);
-        if (!GEP)
-          continue;
-        Type *CurTy = GEP->getSourceElementType();
-        if (Ty0 != CurTy) {
-          LLVM_DEBUG(dbgs()
-                     << "SLP: not-vectorizable GEP (different types).\n");
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          return;
-        }
-      }
-
-      // We don't combine GEPs with non-constant indexes.
-      Type *Ty1 = VL0->getOperand(1)->getType();
-      for (Value *V : VL) {
-        auto *I = dyn_cast<GetElementPtrInst>(V);
-        if (!I)
-          continue;
-        auto *Op = I->getOperand(1);
-        if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
-            (Op->getType() != Ty1 &&
-             ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
-              Op->getType()->getScalarSizeInBits() >
-                  DL->getIndexSizeInBits(
-                      V->getType()->getPointerAddressSpace())))) {
-          LLVM_DEBUG(dbgs()
-                     << "SLP: not-vectorizable GEP (non-constant indexes).\n");
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          return;
-        }
-      }
-
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
@@ -5917,78 +6097,29 @@
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or if we need to swizzle them.
-      llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
-      // Avoid types that are padded when being allocated as scalars, while
-      // being packed together in a vector (such as i1).
-      if (DL->getTypeSizeInBits(ScalarTy) !=
-          DL->getTypeAllocSizeInBits(ScalarTy)) {
-        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
-        LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
-        return;
-      }
-      // Make sure all stores in the bundle are simple - we can't vectorize
-      // atomic or volatile stores.
-      SmallVector<Value *> PointerOps(VL.size());
       ValueList Operands(VL.size());
-      auto POIter = PointerOps.begin();
-      auto OIter = Operands.begin();
+      auto *OIter = Operands.begin();
       for (Value *V : VL) {
         auto *SI = cast<StoreInst>(V);
-        if (!SI->isSimple()) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
-          return;
-        }
-        *POIter = SI->getPointerOperand();
         *OIter = SI->getValueOperand();
-        ++POIter;
         ++OIter;
       }
-
-      OrdersType CurrentOrder;
-      // Check the order of pointer operands.
-      if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
-        Value *Ptr0;
-        Value *PtrN;
-        if (CurrentOrder.empty()) {
-          Ptr0 = PointerOps.front();
-          PtrN = PointerOps.back();
-        } else {
-          Ptr0 = PointerOps[CurrentOrder.front()];
-          PtrN = PointerOps[CurrentOrder.back()];
-        }
-        std::optional<int> Dist =
-            getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
-        // Check that the sorted pointer operands are consecutive.
-        if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
-          if (CurrentOrder.empty()) {
-            // Original stores are consecutive and does not require reordering.
-            TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
-                                         UserTreeIdx, ReuseShuffleIndicies);
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
-          } else {
-            fixupOrderingIndices(CurrentOrder);
-            TreeEntry *TE =
-                newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                             ReuseShuffleIndicies, CurrentOrder);
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
-          }
-          return;
-        }
+      // Check that the sorted pointer operands are consecutive.
+      if (CurrentOrder.empty()) {
+        // Original stores are consecutive and does not require reordering.
+        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                     ReuseShuffleIndicies);
+        TE->setOperandsInOrder();
+        buildTree_rec(Operands, Depth + 1, {TE, 0});
+        LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+      } else {
+        fixupOrderingIndices(CurrentOrder);
+        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                     ReuseShuffleIndicies, CurrentOrder);
+        TE->setOperandsInOrder();
+        buildTree_rec(Operands, Depth + 1, {TE, 0});
+        LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
       }
-
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
-      LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
       return;
     }
     case Instruction::Call: {
@@ -5997,68 +6128,6 @@
       CallInst *CI = cast<CallInst>(VL0);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

-      VFShape Shape = VFShape::get(
-          *CI, ElementCount::getFixed(static_cast<unsigned>(VL.size())),
-          false /*HasGlobalPred*/);
-      Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
-      if (!VecFunc && !isTriviallyVectorizable(ID)) {
-        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
-        LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
-        return;
-      }
-      Function *F = CI->getCalledFunction();
-      unsigned NumArgs = CI->arg_size();
-      SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
-      for (unsigned j = 0; j != NumArgs; ++j)
-        if (isVectorIntrinsicWithScalarOpAtArg(ID, j))
-          ScalarArgs[j] = CI->getArgOperand(j);
-      for (Value *V : VL) {
-        CallInst *CI2 = dyn_cast<CallInst>(V);
-        if (!CI2 || CI2->getCalledFunction() != F ||
-            getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
-            (VecFunc &&
-             VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
-            !CI->hasIdenticalOperandBundleSchema(*CI2)) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
-                            << "\n");
-          return;
-        }
-        // Some intrinsics have scalar arguments and should be same in order for
-        // them to be vectorized.
-        for (unsigned j = 0; j != NumArgs; ++j) {
-          if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {
-            Value *A1J = CI2->getArgOperand(j);
-            if (ScalarArgs[j] != A1J) {
-              BS.cancelScheduling(VL, VL0);
-              newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                           ReuseShuffleIndicies);
-              LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
-                                << " argument " << ScalarArgs[j] << "!=" << A1J
-                                << "\n");
-              return;
-            }
-          }
-        }
-        // Verify that the bundle operands are identical between the two calls.
-        if (CI->hasOperandBundles() &&
-            !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
-                        CI->op_begin() + CI->getBundleOperandsEndIndex(),
-                        CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
-                            << *CI << "!=" << *V << '\n');
-          return;
-        }
-      }
-
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       TE->setOperandsInOrder();
@@ -6078,15 +6147,6 @@
       return;
     }
     case Instruction::ShuffleVector: {
-      // If this is not an alternate sequence of opcode like add-sub
-      // then do not vectorize this instruction.
-      if (!S.isAltShuffle()) {
-        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
-        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
-        return;
-      }
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
@@ -6144,12 +6204,9 @@
       return;
     }
     default:
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
-      LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
-      return;
+      break;
   }
+  llvm_unreachable("Unexpected vectorization of the instructions.");
 }

 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -340,18 +340,18 @@
 define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) {
 ; CHECK-LABEL: @reduce_blockstrided4(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]])
-; CHECK-NEXT:    ret i16 [[TMP11]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]])
+; CHECK-NEXT:    ret i16 [[TMP7]]
 ;
 entry:
   %0 = load i16, ptr %x, align 2
@@ -416,33 +416,33 @@
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]])
-; CHECK-NEXT:    ret i32 [[TMP29]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <16 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]])
+; CHECK-NEXT:    ret i32 [[TMP21]]
 ;
 entry:
   %idx.ext = sext i32 %off1 to i64
@@ -677,60 +677,63 @@
 define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) {
 ; CHECK-LABEL: @store_blockstrided3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
 ; CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL]], 2
-; CHECK-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[ADD14:%.*]] = or i32 [[MUL]], 1
+; CHECK-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
 ; CHECK-NEXT:    [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
-; CHECK-NEXT:    [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
-; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
-; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
+; CHECK-NEXT:    [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2
+; CHECK-NEXT:    [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
-; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
+; CHECK-NEXT:    [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1
-; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[X]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i32>, ptr [[Y]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32>
-; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7
-; CHECK-NEXT:    [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
-; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11
-; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32>
+; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX82:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT:    [[ARRAYIDX90:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 10
+; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4
 ; CHECK-NEXT:    store i32 [[MUL73]], ptr [[Z]], align 4
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], ptr [[ARRAYIDX72]], align 4
-; CHECK-NEXT:    store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4
-; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]]
-; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]]
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32>
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], ptr [[ARRAYIDX84]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
+; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
+; CHECK-NEXT:    store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4
+; CHECK-NEXT:    [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]]
+; CHECK-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 9
+; CHECK-NEXT:    store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -830,17 +833,17 @@
 define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) {
 ; CHECK-LABEL: @store_blockstrided4(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32>
-; CHECK-NEXT:    store <8 x i16> [[SHUFFLE]], ptr [[DST0:%.*]], align 2
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32>
+; CHECK-NEXT:    store <8 x i16> [[TMP6]], ptr [[DST0:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -921,30 +924,30 @@
 ; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4
 ; CHECK-NEXT:    [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8
 ; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP10]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP26]], [[TMP29]]
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[DST0]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[DST4]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[DST8]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP30]], ptr [[DST12]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[DST0]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[DST4]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[DST8]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[DST12]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1200,86 +1203,86 @@
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> [[TMP17]], <16 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32>
-; CHECK-NEXT:    [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32>
-; CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> [[TMP19]], <16 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32>
-; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
-; CHECK-NEXT:    [[TMP40:%.*]] = sub nsw <16 x i32> [[TMP31]], [[TMP39]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP42]], <4 x i8> [[TMP21]], <16 x i32>
-; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32>
-; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
-; CHECK-NEXT:    [[TMP50:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP50]], <4 x i8> [[TMP23]], <16 x i32>
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32>
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32>
-; CHECK-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]],
-; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]]
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32>
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32>
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP68]]
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32>
-; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP73:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP72]]
-; CHECK-NEXT:    [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP71]], [[TMP72]]
-; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32>
-; CHECK-NEXT:    [[TMP76:%.*]] = lshr <16 x i32> [[TMP75]],
-; CHECK-NEXT:    [[TMP77:%.*]] = and <16 x i32> [[TMP76]],
-; CHECK-NEXT:    [[TMP78:%.*]] = mul nuw <16 x i32> [[TMP77]],
-; CHECK-NEXT:    [[TMP79:%.*]] = add <16 x i32> [[TMP78]], [[TMP75]]
-; CHECK-NEXT:    [[TMP80:%.*]] = xor <16 x i32> [[TMP79]], [[TMP78]]
-; CHECK-NEXT:    [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP80]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP81]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP81]], 16
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32>
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP9]], <16 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32>
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP10]], <16 x i32>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32>
+; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP11]], <16 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32>
+; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
+; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]],
+; CHECK-NEXT:    [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]]
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP52]]
+; CHECK-NEXT:    [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP51]], [[TMP52]]
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32>
+; CHECK-NEXT:    [[TMP60:%.*]] = lshr <16 x i32> [[TMP59]],
+; CHECK-NEXT:    [[TMP61:%.*]] = and <16 x i32> [[TMP60]],
+; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw <16 x i32> [[TMP61]],
+; CHECK-NEXT:    [[TMP63:%.*]] = add <16 x i32> [[TMP62]], [[TMP59]]
+; CHECK-NEXT:    [[TMP64:%.*]] = xor <16 x i32> [[TMP63]], [[TMP62]]
+; CHECK-NEXT:    [[TMP65:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP64]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP65]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP65]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -828,10 +828,10 @@
 define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
 ; CHECK-LABEL: @stride_sum_abs_diff(
-; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]]
-; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
+; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
@@ -18,11 +18,11 @@
 ; CHECK-NEXT:    [[V1:%.*]] = sub i64 [[A0]], 1
 ; CHECK-NEXT:    [[V2:%.*]] = sub i64 [[B0]], 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[S:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[S:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -60,20 +60,20 @@
 ; CHECK-LABEL: @bcast_vals2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A0:%.*]] = load i16, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[V1:%.*]] = sext i16 [[A0]] to i32
 ; CHECK-NEXT:    [[B0:%.*]] = load i16, ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load i16, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load i16, ptr [[D:%.*]], align 8
 ; CHECK-NEXT:    [[E0:%.*]] = load i16, ptr [[E:%.*]], align 8
+; CHECK-NEXT:    [[V1:%.*]] = sext i16 [[A0]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] =
insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP4]] +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll @@ -4,12 +4,12 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) { ; CHECK-LABEL: define <4 x double> @test ; CHECK-SAME: (ptr [[IA:%.*]], ptr [[IB:%.*]], ptr [[IC:%.*]], ptr [[ID:%.*]], ptr [[IE:%.*]], ptr [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8 -; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8 ; CHECK-NEXT: [[I4238:%.*]] = load double, ptr [[IA]], align 8 ; CHECK-NEXT: [[I4252:%.*]] = load double, ptr [[IB]], align 8 ; CHECK-NEXT: [[I4264:%.*]] = load double, ptr [[IC]], align 8 +; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8 ; CHECK-NEXT: [[I4277:%.*]] = load double, ptr [[IE]], align 8 +; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -93,9 +93,9 @@ define void @zot(ptr %arg) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], ptr [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP:%.*]] = load double, ptr undef, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr undef, align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], ptr [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -207,10 +207,10 @@ ; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = 
load double, ptr [[D:%.*]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 ; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] @@ -285,10 +285,10 @@ ; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 ; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll @@ -9,15 +9,15 @@ define void @test(ptr %i1, ptr %i2, ptr %o) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, ptr [[I1:%.*]], i64 1 -; CHECK-NEXT: [[I1_0:%.*]] = load double, ptr [[I1]], align 16 +; CHECK-NEXT: [[I1_0:%.*]] = load double, ptr [[I1:%.*]], align 16 +; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, ptr [[I1]], i64 1 ; CHECK-NEXT: [[I1_1:%.*]] = load double, ptr [[I1_GEP1]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1_0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1 ; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: -; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, ptr [[I2:%.*]], i64 1 -; CHECK-NEXT: [[I2_0:%.*]] = load double, ptr [[I2]], align 16 +; CHECK-NEXT: [[I2_0:%.*]] = load double, ptr [[I2:%.*]], align 16 +; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, ptr [[I2]], i64 1 ; CHECK-NEXT: [[I2_1:%.*]] = load double, ptr [[I2_GEP1]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I2_0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -130,16 +130,16 @@ ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; 
AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -147,16 +147,16 @@ ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -164,16 +164,16 @@ ; ; 
AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -254,56 +254,56 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 11 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> 
[[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 +; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 +; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 +; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 11 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> 
poison, i32 [[TMP10]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -532,159 +532,159 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1:%.*]], i64 4 -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP4]], align 4, 
!tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 +; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 ; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] ; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr 
[[TMP1]], i64 30 -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; SSE-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP38]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP40]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP39]], i64 1 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP41]], i64 2 +; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2 ; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 ; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x 
float> [[TMP47]], [[TMP51]] -; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1:%.*]], i64 4 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = 
insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = 
insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 ; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1:%.*]], i64 4 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP17]], align 
4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] 
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 ; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -130,16 +130,16 @@ ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX-NEXT: [[TMP10:%.*]] = load i32, 
ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -147,16 +147,16 @@ ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -164,16 +164,16 @@ ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 ; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] @@ -254,56 +254,56 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 11 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 +; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 
+; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
+; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
+; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
+; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
+; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
+; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
+; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
+; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
+; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]],
 ; AVX-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 11
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
-; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
-; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
+; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
+; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
+; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
+; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
+; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
+; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
+; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
+; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
+; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
+; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
+; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]],
 ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
@@ -532,159 +532,159 @@
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
-; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1:%.*]], i64 4
-; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
-; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
-; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
-; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
-; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0
-; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1
-; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2
-; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3
-; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0
-; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1
-; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2
-; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3
+; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4
+; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
+; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
+; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1
+; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2
+; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3
+; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
+; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1
+; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2
+; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3
 ; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
 ; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
-; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
-; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
-; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
-; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
-; SSE-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP35]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i64 0
-; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP38]], i64 1
-; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP40]], i64 2
-; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP42]], i64 3
-; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0
-; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP39]], i64 1
-; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP41]], i64 2
+; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
+; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
+; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
+; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
+; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
+; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
+; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
+; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0
+; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1
+; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2
+; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3
+; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0
+; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1
+; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2
 ; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3
 ; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]]
-; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @gather_load_div(
-; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1:%.*]], i64 4
-; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
-; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
-; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
-; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
-; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
-; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
-; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
-; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
-; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP30:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
-; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
-; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
-; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
-; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
-; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
-; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
-; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
-; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
-; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
-; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
-; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
-; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
-; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
-; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
+; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4
+; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
+; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
+; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
+; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
+; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
+; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
+; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
+; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
+; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
+; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
+; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
+; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
+; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
+; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
+; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
+; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
+; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
+; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
+; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
+; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
+; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
+; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
+; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
+; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
 ; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
 ; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
 ; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
-; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1:%.*]], i64 4
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
-; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
-; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
-; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
-; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
-; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
-; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
-; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP28:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP30:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
-; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
-; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
-; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
-; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
-; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
-; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
-; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
-; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
-; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
-; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
-; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
-; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
-; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
-; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
+; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4
+; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
+; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
+; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
+; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
+; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
+; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
+; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
+; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
+; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
+; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
+; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
+; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
+; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
+; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
+; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
+; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
+; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
+; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
+; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
+; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
+; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
+; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
+; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
+; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
 ; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
 ; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
 ; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -11,18 +11,18 @@
 ; CHECK: while:
 ; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A2]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP1]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32>
 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32>
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32>
 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP0]], [[TMP11]]
+; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
 ; CHECK-NEXT: br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
@@ -6,10 +6,10 @@
 ; Base case without allocas or stacksave
 define void @basecase(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @basecase(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x ptr>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT: store ptr null, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32>
-; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i32>
+; CHECK-NEXT: store <2 x ptr> [[TMP2]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -187,13 +187,13 @@
 ; encountered during dependency scanning via the memory chain.
 define void @stacksave4(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @stacksave4(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x ptr>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[STACK:%.*]] = call ptr @llvm.stacksave()
 ; CHECK-NEXT: [[X:%.*]] = alloca inalloca i8, align 1
 ; CHECK-NEXT: call void @use(ptr inalloca(i8) [[X]]) #[[ATTR4]]
 ; CHECK-NEXT: call void @llvm.stackrestore(ptr [[STACK]])
-; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: store <2 x ptr> [[TMP2]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -217,13 +217,13 @@
 define void @stacksave5(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @stacksave5(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x ptr>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[STACK:%.*]] = call ptr @llvm.stacksave()
 ; CHECK-NEXT: [[X:%.*]] = alloca inalloca i8, align 1
 ; CHECK-NEXT: call void @use(ptr inalloca(i8) [[X]]) #[[ATTR4]]
 ; CHECK-NEXT: call void @llvm.stackrestore(ptr [[STACK]])
-; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: store <2 x ptr> [[TMP2]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -292,23 +292,23 @@
 ; CHECK-LABEL: @ham(
 ; CHECK-NEXT: [[VAR2:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: [[VAR3:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[VAR4:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[VAR5:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: [[VAR12:%.*]] = alloca [12 x ptr], align 8
 ; CHECK-NEXT: [[VAR15:%.*]] = call ptr @wibble(ptr [[VAR2]])
 ; CHECK-NEXT: [[VAR16:%.*]] = call ptr @wibble(ptr [[VAR3]])
-; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x ptr], ptr [[VAR12]], i32 0, i32 4
-; CHECK-NEXT: [[VAR4:%.*]] = alloca i8, align 1
-; CHECK-NEXT: [[VAR5:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: [[VAR17:%.*]] = call ptr @wibble(ptr [[VAR4]])
 ; CHECK-NEXT: [[VAR23:%.*]] = call ptr @llvm.stacksave()
 ; CHECK-NEXT: [[VAR24:%.*]] = alloca inalloca i32, align 4
 ; CHECK-NEXT: call void @quux(ptr inalloca(i32) [[VAR24]])
 ; CHECK-NEXT: call void @llvm.stackrestore(ptr [[VAR23]])
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> poison, ptr [[VAR4]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: store <4 x ptr> [[SHUFFLE]], ptr [[VAR12]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> [[TMP2]], ptr [[VAR5]], i32 1
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32>
-; CHECK-NEXT: store <4 x ptr> [[SHUFFLE1]], ptr [[VAR36]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[VAR4]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[VAR12]], align 8
+; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x ptr], ptr [[VAR12]], i32 0, i32 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[VAR5]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32>
+; CHECK-NEXT: store <4 x ptr> [[TMP4]], ptr [[VAR36]], align 8
 ; CHECK-NEXT: ret void
 ;
 %var2 = alloca i8
@@ -343,14 +343,14 @@
 define void @spam() #1 {
 ; CHECK-LABEL: @spam(
-; CHECK-NEXT: [[VAR12:%.*]] = alloca [12 x ptr], align 8
-; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x ptr], ptr [[VAR12]], i32 0, i32 4
 ; CHECK-NEXT: [[VAR4:%.*]] = alloca i8, align 1
 ; CHECK-NEXT: [[VAR5:%.*]] = alloca i8, align 1
+; CHECK-NEXT: [[VAR12:%.*]] = alloca [12 x ptr], align 8
+; CHECK-NEXT: [[VAR36:%.*]] = getelementptr inbounds [12 x ptr], ptr [[VAR12]], i32 0, i32 4
 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[VAR4]], i32 0
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[VAR5]], i32 1
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32>
-; CHECK-NEXT: store <4 x ptr> [[SHUFFLE]], ptr [[VAR36]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32>
+; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[VAR36]], align 8
 ; CHECK-NEXT: ret void
 ;
 %var4 = alloca i8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
@@ -10,12 +10,12 @@
 ; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, ptr [[GEP_2]], align 4
 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 3
 ; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, ptr [[GEP_3]], align 4
-; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, ptr [[INN:%.*]], i64 1
+; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, ptr [[INN:%.*]], align 4
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, ptr [[INN]], i64 1
+; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, ptr [[GEP_4]], align 4
 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, ptr [[INN]], i64 2
-; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, ptr [[INN]], align 4
 ; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, ptr [[GEP_5]], align 4
 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, ptr [[INN]], i64 3
-; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, ptr [[GEP_4]], align 4
 ; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, ptr [[GEP_6]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -11,16 +11,16 @@
 ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
 ; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
 ; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
 ; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
 ; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
-; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
-; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
-; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
-; ENABLED-NEXT: store <2 x double> [[TMP7]], ptr [[SARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
+; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
+; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1
+; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT: ret void
 ;
 entry:
@@ -58,16 +58,16 @@
 ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
 ; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
 ; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
 ; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
 ; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
-; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
-; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
-; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
-; ENABLED-NEXT: store <2 x double> [[TMP7]], ptr [[SARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
+; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]]
+; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1
+; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT: ret void
 ;
 entry:
@@ -178,16 +178,16 @@
 ; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
 ; ENABLED-NEXT: [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8
 ; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
-; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
 ; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
-; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
-; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1
-; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
 ; ENABLED-NEXT: [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8
-; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
-; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1
-; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
-; ENABLED-NEXT: store <2 x double> [[TMP7]], ptr [[SARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B1]], i32 1
+; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
+; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1
+; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT: [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi ptr [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[SRC_ADDR_013]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[DST_ADDR_014]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[SRC_ADDR_013]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[DST_ADDR_014]], align 8
 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, ptr [[SRC_ADDR_013]], i64 [[I_015]]
 ; CHECK-NEXT: [[ADD_PTR4]] = getelementptr inbounds double, ptr [[DST_ADDR_014]], i64 [[I_015]]
 ; CHECK-NEXT: [[INC]] = add i64 [[I_015]], 1
@@ -53,8 +53,8 @@
 ; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC_ADDR_021]], align 4
-; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST_ADDR_022]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC_ADDR_021]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[DST_ADDR_022]], align 4
 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]]
 ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]]
 ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
@@ -153,16 +153,16 @@
 ; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 2
 ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_021]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32>
-; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[DST_ADDR_022]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[DST_ADDR_022]], align 4
 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]]
 ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]]
 ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
@@ -205,9 +205,9 @@
 define void @store_splat(ptr, float) {
 ; CHECK-LABEL: @store_splat(
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP1:%.*]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: store <4 x float> [[SHUFFLE]], ptr [[TMP0:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP1:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
 store float %1, ptr %0, align 4
@@ -284,8 +284,8 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 undef to i16
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i32 0
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP2]], i32 1
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], ptr [[A:%.*]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[A:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
 %1 = load i16, ptr %v1, align 4