diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2267,13 +2267,14 @@ /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); - /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the entry + /// \p E. + Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. - Value *createBuildVector(ArrayRef VL); + Value *createBuildVector(const TreeEntry *E); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. If \p @@ -2376,6 +2377,12 @@ return IsSame(Scalars, ReuseShuffleIndices); } + bool isOperandGatherNode(const EdgeInfo &UserEI) const { + return State == TreeEntry::NeedToGather && + UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && + UserTreeIndices.front().UserTE == UserEI.UserTE; + } + /// \returns true if current entry has same operands as \p TE. bool hasEqualOperands(const TreeEntry &TE) const { if (TE.getNumOperands() != getNumOperands()) @@ -3910,17 +3917,22 @@ } void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const { - // For vectorized and non-clustered reused - just reorder reuses mask. + // Reorder reuses mask. + reorderReuses(TE.ReuseShuffleIndices, Mask); const unsigned Sz = TE.Scalars.size(); - if (TE.State != TreeEntry::NeedToGather || !TE.ReorderIndices.empty() || + // For vectorized and non-clustered reuses, there is no need to do anything else. 
+ if (TE.State != TreeEntry::NeedToGather || !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, Sz) || - !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) { - reorderReuses(TE.ReuseShuffleIndices, Mask); + !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) return; - } + SmallVector NewMask; + inversePermutation(TE.ReorderIndices, NewMask); + addMask(NewMask, TE.ReuseShuffleIndices); + // Clear reorder since it is going to be applied to the new mask. + TE.ReorderIndices.clear(); // Try to improve gathered nodes with clustered reuses, if possible. - reorderScalars(TE.Scalars, makeArrayRef(TE.ReuseShuffleIndices).slice(0, Sz)); + reorderScalars(TE.Scalars, makeArrayRef(NewMask).slice(0, Sz)); // Fill the reuses mask with the identity submasks. for (auto *It = TE.ReuseShuffleIndices.begin(), *End = TE.ReuseShuffleIndices.end(); @@ -8036,7 +8048,8 @@ }; } // namespace -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { + ArrayRef VL = E->getOperand(NodeIdx); const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. @@ -8047,123 +8060,177 @@ S = getSameOpcode(*It, *TLI); } if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VF != cast(V->getType())->getNumElements()) { - if (!E->ReuseShuffleIndices.empty()) { - // Reshuffle to get only unique values. - // If some of the scalars are duplicated in the vectorization tree - // entry, we do not vectorize them but instead generate a mask for - // the reuses. But if there are several users of the same entry, - // they may have different vectorization factors. This is especially - // important for PHI nodes. 
In this case, we need to adapt the - // resulting instruction for the user vectorization factor and have - // to reshuffle it again to take only unique elements of the vector. - // Without this code the function incorrectly returns reduced vector - // instruction with the same elements, not with the unique ones. - - // block: - // %phi = phi <2 x > { .., %entry} {%shuffle, %block} - // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> - // ... (use %2) - // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} - // br %block - SmallVector UniqueIdxs(VF, UndefMaskElem); - SmallSet UsedIdxs; - int Pos = 0; - int Sz = VL.size(); - for (int Idx : E->ReuseShuffleIndices) { - if (Idx != Sz && Idx != UndefMaskElem && - UsedIdxs.insert(Idx).second) - UniqueIdxs[Idx] = Pos; - ++Pos; - } - assert(VF >= UsedIdxs.size() && "Expected vectorization factor " - "less than original vector size."); - UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); - V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); - } else { - assert(VF < cast(V->getType())->getNumElements() && - "Expected vectorization factor less " - "than original vector size."); - SmallVector UniformMask(VF, 0); - std::iota(UniformMask.begin(), UniformMask.end(), 0); - V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); - } - if (auto *I = dyn_cast(V)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (TreeEntry *VE = getTreeEntry(S.OpValue); VE && VE->isSame(VL)) { + assert((any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.EdgeIdx == NodeIdx && EI.UserTE == E; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr &TE) { + return TE->isOperandGatherNode({E, NodeIdx}) && + VE->isSame(TE->Scalars); + })) && + "Expected same vectorizable node."); + Value *V = vectorizeTree(VE); + if (VF != cast(V->getType())->getNumElements()) { + if (!VE->ReuseShuffleIndices.empty()) { + // Reshuffle to get only unique values. 
+ // If some of the scalars are duplicated in the vectorization + // tree entry, we do not vectorize them but instead generate a + // mask for the reuses. But if there are several users of the + // same entry, they may have different vectorization factors. + // This is especially important for PHI nodes. In this case, we + // need to adapt the resulting instruction for the user + // vectorization factor and have to reshuffle it again to take + // only unique elements of the vector. Without this code the + // function incorrectly returns reduced vector instruction with + // the same elements, not with the unique ones. + + // block: + // %phi = phi <2 x > { .., %entry} {%shuffle, %block} + // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> + // ... (use %2) + // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} + // br %block + SmallVector UniqueIdxs(VF, UndefMaskElem); + SmallSet UsedIdxs; + int Pos = 0; + for (int Idx : VE->ReuseShuffleIndices) { + if (Idx != static_cast(VF) && Idx != UndefMaskElem && + UsedIdxs.insert(Idx).second) + UniqueIdxs[Idx] = Pos; + ++Pos; } + assert(VF >= UsedIdxs.size() && "Expected vectorization factor " + "less than original vector size."); + UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); + V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); + } else { + assert(VF < cast(V->getType())->getNumElements() && + "Expected vectorization factor less " + "than original vector size."); + SmallVector UniformMask(VF, 0); + std::iota(UniformMask.begin(), UniformMask.end(), 0); + V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); + } + if (auto *I = dyn_cast(V)) { + GatherShuffleExtractSeq.insert(I); + CSEBlocks.insert(I->getParent()); } - return V; } + return V; + } } - // Can't vectorize this, so simply build a new vector with each lane - // corresponding to the requested value. - return createBuildVector(VL); + // Find the corresponding gather entry and vectorize it. 
+ // This allows tree/graph transformations to be more accurate and checks the + // correctness of the transformations in many cases. + auto *I = find_if(VectorizableTree, + [E, NodeIdx](const std::unique_ptr &TE) { + return TE->isOperandGatherNode({E, NodeIdx}); + }); + assert(I != VectorizableTree.end() && "Gather node is not in the graph."); + assert(I->get()->UserTreeIndices.size() == 1 && + "Expected only single user for the gather node."); + assert(I->get()->isSame(VL) && "Expected same list of scalars."); + return vectorizeTree(I->get()); } -Value *BoUpSLP::createBuildVector(ArrayRef VL) { - assert(any_of(VectorizableTree, - [VL](const std::unique_ptr &TE) { - return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); - }) && - "Non-matching gather node."); - unsigned VF = VL.size(); - // Exploit possible reuse of values across lanes. - SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { + +Value *BoUpSLP::createBuildVector(const TreeEntry *E) { + assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); + unsigned VF = E->getVectorFactor(); + + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleExtractSeq, + CSEBlocks); + SmallVector Gathered( + VF, PoisonValue::get(E->Scalars.front()->getType())); + bool NeedFreeze = false; + SmallVector VL(E->Scalars.begin(), E->Scalars.end()); + // Build a mask out of the reorder indices and reorder scalars per this mask. + SmallVector ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + if (!ReorderMask.empty()) + reorderScalars(VL, ReorderMask); + if (!allConstant(VL)) { + // For splats we can emit broadcasts instead of gathers, so try to find + // such sequences. 
+ bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back()); + SmallVector ReuseMask(VF, UndefMaskElem); + SmallVector UndefPos; DenseMap UniquePositions; - unsigned NumValues = - std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) { - return !isa(V); - }).base()); - VF = std::max(VF, PowerOf2Ceil(NumValues)); - int UniqueVals = 0; - for (Value *V : VL.drop_back(VL.size() - VF)) { + // Gather unique non-const values and all constant values. + // For repeated values, just shuffle them. + for (auto [I, V] : enumerate(VL)) { if (isa(V)) { - ReuseShuffleIndicies.emplace_back(UndefMaskElem); + if (!isa(V)) { + Gathered[I] = V; + ReuseMask[I] = I; + UndefPos.push_back(I); + } continue; } if (isConstant(V)) { - ReuseShuffleIndicies.emplace_back(UniqueValues.size()); - UniqueValues.emplace_back(V); + Gathered[I] = V; + ReuseMask[I] = I; continue; } - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) { - UniqueValues.emplace_back(V); - ++UniqueVals; - } - } - if (UniqueVals == 1 && UniqueValues.size() == 1) { - // Emit pure splat vector. - ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), - UndefMaskElem); - } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { - if (UniqueValues.empty()) { - assert(all_of(VL, UndefValue::classof) && "Expected list of undefs."); - NumValues = VF; + if (IsSplat) { + Gathered.front() = V; + ReuseMask[I] = 0; + } else { + const auto Res = UniquePositions.try_emplace(V, I); + Gathered[Res.first->second] = V; + ReuseMask[I] = Res.first->second; + } + } + if (!UndefPos.empty() && IsSplat) { + // For undef values, try to replace them with the simple broadcast. + // We can do it if the broadcasted value is guaranteed to be + // non-poisonous, or by freezing the incoming scalar value first. 
+ auto *It = find_if(Gathered, [this, E](Value *V) { + return !isa(V) && + (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || + any_of(V->uses(), [E](const Use &U) { + // Check if the value already used in the same operation in + // one of the nodes already. + return E->UserTreeIndices.size() == 1 && + is_contained( + E->UserTreeIndices.front().UserTE->Scalars, + U.getUser()) && + E->UserTreeIndices.front().EdgeIdx != U.getOperandNo(); + })); + }); + if (It != Gathered.end()) { + // Replace undefs by the non-poisoned scalars and emit broadcast. + int Pos = std::distance(Gathered.begin(), It); + for_each(UndefPos, [&](int I) { + // Set the undef position to the non-poisoned scalar. + ReuseMask[I] = Pos; + // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already. + if (I != Pos) + Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + }); + } else { + // Replace undefs by the poisons, emit broadcast and then emit + // freeze. + for_each(UndefPos, [&](int I) { + ReuseMask[I] = UndefMaskElem; + if (isa(Gathered[I])) + Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + }); + NeedFreeze = true; } - ReuseShuffleIndicies.clear(); - UniqueValues.clear(); - UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); } - UniqueValues.append(VF - UniqueValues.size(), - PoisonValue::get(VL[0]->getType())); - VL = UniqueValues; - } - - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleExtractSeq, - CSEBlocks); - Value *Vec = gather(VL); - if (!ReuseShuffleIndicies.empty()) { - ShuffleBuilder.addMask(ReuseShuffleIndicies); - Vec = ShuffleBuilder.finalize(Vec); - } + ShuffleBuilder.addMask(ReuseMask); + } else { + copy(VL, Gathered.begin()); + } + // Gather unique scalars and all constants. 
+ Value *Vec = gather(Gathered); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + Vec = ShuffleBuilder.finalize(Vec); + if (NeedFreeze) + Vec = Builder.CreateFreeze(Vec); return Vec; } @@ -8180,6 +8247,13 @@ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleExtractSeq, CSEBlocks); if (E->State == TreeEntry::NeedToGather) { + if (E->Idx > 0) { + // We are in the middle of a vectorizable chain. We need to gather the + // scalars from the users. + Value *Vec = createBuildVector(E); + E->VectorizedValue = Vec; + return Vec; + } if (E->getMainOp()) setInsertPointAfterBundle(E); Value *Vec; @@ -8256,7 +8330,7 @@ Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(E->getOperand(i)); + Value *Vec = vectorizeOperand(E, i); NewPhi->addIncoming(Vec, IBB); } @@ -8290,7 +8364,7 @@ case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(cast(E->Scalars.back())); - Value *V = vectorizeTree(E->getOperand(1)); + Value *V = vectorizeOperand(E, 1); // Create InsertVector shuffle if necessary auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { @@ -8396,7 +8470,7 @@ case Instruction::BitCast: { setInsertPointAfterBundle(E); - Value *InVec = vectorizeTree(E->getOperand(0)); + Value *InVec = vectorizeOperand(E, 0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -8417,8 +8491,8 @@ case Instruction::ICmp: { setInsertPointAfterBundle(E); - Value *L = vectorizeTree(E->getOperand(0)); - Value *R = vectorizeTree(E->getOperand(1)); + Value *L = vectorizeOperand(E, 0); + Value *R = vectorizeOperand(E, 1); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -8439,9 +8513,9 @@ case Instruction::Select: { setInsertPointAfterBundle(E); - Value *Cond = vectorizeTree(E->getOperand(0)); - Value *True = 
vectorizeTree(E->getOperand(1)); - Value *False = vectorizeTree(E->getOperand(2)); + Value *Cond = vectorizeOperand(E, 0); + Value *True = vectorizeOperand(E, 1); + Value *False = vectorizeOperand(E, 2); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -8460,7 +8534,7 @@ case Instruction::FNeg: { setInsertPointAfterBundle(E); - Value *Op = vectorizeTree(E->getOperand(0)); + Value *Op = vectorizeOperand(E, 0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -8502,8 +8576,8 @@ case Instruction::Xor: { setInsertPointAfterBundle(E); - Value *LHS = vectorizeTree(E->getOperand(0)); - Value *RHS = vectorizeTree(E->getOperand(1)); + Value *LHS = vectorizeOperand(E, 0); + Value *RHS = vectorizeOperand(E, 1); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -8550,7 +8624,7 @@ } } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeTree(E->getOperand(0)); + Value *VecPtr = vectorizeOperand(E, 0); // Use the minimum alignment of the gathered loads. 
Align CommonAlignment = LI->getAlign(); for (Value *V : E->Scalars) @@ -8573,7 +8647,7 @@ setInsertPointAfterBundle(E); - Value *VecValue = vectorizeTree(E->getOperand(0)); + Value *VecValue = vectorizeOperand(E, 0); ShuffleBuilder.addMask(E->ReorderIndices); VecValue = ShuffleBuilder.finalize(VecValue); @@ -8604,11 +8678,11 @@ auto *GEP0 = cast(VL0); setInsertPointAfterBundle(E); - Value *Op0 = vectorizeTree(E->getOperand(0)); + Value *Op0 = vectorizeOperand(E, 0); SmallVector OpVecs; for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { - Value *OpVec = vectorizeTree(E->getOperand(J)); + Value *OpVec = vectorizeOperand(E, J); OpVecs.push_back(OpVec); } @@ -8662,7 +8736,7 @@ continue; } - Value *OpVec = vectorizeTree(E->getOperand(j)); + Value *OpVec = vectorizeOperand(E, j); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) @@ -8717,11 +8791,11 @@ Value *LHS = nullptr, *RHS = nullptr; if (Instruction::isBinaryOp(E->getOpcode()) || isa(VL0)) { setInsertPointAfterBundle(E); - LHS = vectorizeTree(E->getOperand(0)); - RHS = vectorizeTree(E->getOperand(1)); + LHS = vectorizeOperand(E, 0); + RHS = vectorizeOperand(E, 1); } else { setInsertPointAfterBundle(E); - LHS = vectorizeTree(E->getOperand(0)); + LHS = vectorizeOperand(E, 0); } if (E->VectorizedValue) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -25,42 +25,42 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = 
fmul <2 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[SHUFFLE1]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[OUT:%.*]] to <2 x double>* ; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP4]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[TMP16]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP9]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x double> [[TMP15]], 
[[TMP18]] -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[TMP20]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x double>, <2 x double>* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[SHUFFLE1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[TMP13]], [[TMP16]] +; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8 ; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[TEMP10]], i32 1 -; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP2]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> [[TMP24]], double [[TEMP11]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP7]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP23]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[TMP28]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, 
double [[TEMP10]], i32 0 +; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP2]], [[SHUFFLE4]] +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 +; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP6]], [[SHUFFLE5]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[TMP20]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP23]], <2 x double>* [[TMP24]], align 8 ; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6 -; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[TMP14]], [[TMP22]] -; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[TMP17]], [[TMP25]] -; CHECK-NEXT: [[TMP31:%.*]] = fadd <2 x double> [[TMP29]], [[TMP30]] -; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP31]], <2 x double>* [[TMP32]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = fmul <2 x double> [[TMP12]], [[SHUFFLE4]] +; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP15]], [[SHUFFLE5]] +; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[TMP28]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll @@ -10,18 +10,18 @@ ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 
x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -50,18 +50,18 @@ ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub reassoc <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd reassoc <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP5:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub reassoc <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd reassoc <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -91,18 +91,18 @@ ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = 
shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -132,18 +132,18 @@ ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], 
align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -173,18 +173,18 @@ ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul nnan <2 x float> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul nnan <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub nnan <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd nnan <2 x float> [[TMP7]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP10:%.*]] = 
shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP11]], ptr [[B]], align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan <2 x float> [[SHUFFLE1]], [[TMP1]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub nnan <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd nnan <2 x float> [[TMP5]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -267,16 +267,16 @@ ; CHECK-NEXT: [[SUB_I1096:%.*]] = fsub fast float 1.000000e+00, [[TMP0:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x 
float> [[TMP5]], float [[SUB_I1096]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x float> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x float> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[SHUFFLE1]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x float> [[SHUFFLE1]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll @@ -16,14 +16,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x 
double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -63,14 +63,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[GEP_1_0]] to <2 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[GEP_1_0]] to <2 x 
float>* -; CHECK-NEXT: store <2 x float> [[TMP8]], <2 x float>* [[TMP9]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[LD_2_1]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[GEP_1_0]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -110,14 +110,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[GEP_1_0]] to <2 x i64>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[GEP_1_0]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP9]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP1]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[GEP_1_0]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -157,14 +157,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[GEP_1_0]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LD_2_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[LD_2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i32> [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[GEP_1_0]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP1]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[GEP_1_0]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -19,23 +19,22 @@ ; CHECK-LABEL: @s116_modified( ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 4 -; CHECK-NEXT: [[LD1:%.*]] = load float, float* [[GEP1]], align 4 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, float* [[GEP0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP2]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[LD4:%.*]] = load float, float* [[GEP4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[LD4]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[LD1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[LD1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[GEP0]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[GEP3]] to <2 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float 
[[LD0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[TMP9]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: ret void ; %gep0 = getelementptr inbounds float, float* %a, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -518,10 +518,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[X]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[PTR]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 2 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> 
[[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[PTR]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 2 ; CHECK-NEXT: ret void ; entry: @@ -585,10 +585,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[X]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 2 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -60,11 +60,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V3_LANE_1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x 
i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V3_LANE_1]]) -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -102,12 +102,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <4 x double> [[TMP4]], <4 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -139,14 +139,14 @@ ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> 
[[TMP2]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: call void @use(double [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP5]]) -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -182,12 +182,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_1]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_2]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) -; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <4 x double> 
[[TMP4]], <4 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -288,15 +288,15 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_3]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[V2_LANE_2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[V2_LANE_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[V2_LANE_2]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[V2_LANE_0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[V2_LANE_0]], i32 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <9 x double> [[TMP9]], <9 x double>* [[PTR_1]], align 8 +; CHECK-NEXT: store <9 x double> [[TMP8]], <9 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -115,10 +115,10 @@ ; GCN-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* ; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 ; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, 
half [[SCALAR:%.*]], i32 0 -; GCN-NEXT: [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[SCALAR]], i32 1 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* -; GCN-NEXT: store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2 +; GCN-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x half> [[TMP3]], <2 x half> poison, <2 x i32> zeroinitializer +; GCN-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP2]], [[SHUFFLE]] +; GCN-NEXT: [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* +; GCN-NEXT: store <2 x half> [[TMP4]], <2 x half> addrspace(3)* [[TMP5]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, half addrspace(3)* %a, align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -9,17 +9,17 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x 
i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP0]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = fptosi <2 x double> [[TMP6]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = sext <2 x i32> [[TMP7]] to <2 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i32 0 +; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 +; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP10]], 1 ; CHECK-NEXT: ret { i64, i64 } [[T17]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -6,17 +6,17 @@ ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; 
CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP14:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE7]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE8]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] @@ -25,10 +25,10 @@ ; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP14]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> +; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = 
and <2 x i32> [[TMP9]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP9]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll @@ -5,9 +5,9 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = load i16, i16* undef, align 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[SHUFFLE]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <8 x i16> [[SHUFFLE]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast_long.ll @@ -19,8 +19,9 @@ ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[A0]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDXS0]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> 
[[SHUFFLE]], <8 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = freeze <8 x i32> [[SHUFFLE]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[IDXS0]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP1]], <8 x i32>* [[TMP2]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll @@ -46,11 +46,11 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[A]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[SHUFFLE]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -71,19 +71,19 @@ ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 ; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> 
[[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 -; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 -; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = fdiv <2 x double> [[TMP30]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP32]], i32 1 +; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP33]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP32]], i32 0 +; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false ; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 -; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] +; CHECK-NEXT: [[TMP35:%.*]] = fcmp ule <2 x double> [[TMP32]], +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP35]], i32 1 +; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP37]], i1 true, i1 [[TMP36]] ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 ; CHECK-NEXT: br label [[CLEANUP]] ; CHECK: cleanup: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll @@ -14,10 +14,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x 
double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp une <2 x double> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[G:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[G]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x double> [[TMP4]], <2 x double> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP2]], <2 x double> [[SHUFFLE]], <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -97,9 +97,9 @@ ; AVX-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] ; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1 ; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 3 -; AVX-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], [[TMP6]] -; AVX-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16 +; AVX-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[SHUFFLE2]] +; AVX-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16 ; AVX-NEXT: ret void ; %add1 = add i32 %c, %a diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -11,20 +11,20 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 
x double> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -12,32 +12,32 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; 
CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> -; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], -; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> -; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], -; CHECK-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP0]], [[SHUFFLE]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = fcmp olt <2 x float> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP5]], <2 x float> +; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP8]], <2 x float> , <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = 
extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[ADD13]] = fadd float [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[ADD13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = fcmp olt <2 x float> [[TMP14]], +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x float> [[TMP14]], <2 x float> +; CHECK-NEXT: [[TMP17:%.*]] = fcmp olt <2 x float> [[TMP16]], +; CHECK-NEXT: [[TMP18]] = select <2 x i1> [[TMP17]], <2 x float> , <2 x float> [[TMP16]] ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -5,19 +5,19 @@ ; CHECK-LABEL: @exceed( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] 
= shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <2 x double> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP5]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP5]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -27,16 +27,16 @@ ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP6]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP7]] ; CHECK-NEXT: 
[[IXX101:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x double> [[TMP12]], undef ; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB2:%.*]] ; CHECK-NEXT: ] @@ -45,7 +45,7 @@ ; CHECK: bb2: ; CHECK-NEXT: br label [[LABEL]] ; CHECK: label: -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x double> [ [[TMP10]], [[BB1]] ], [ [[TMP13]], [[BB2]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -246,22 +246,22 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[A]] to <2 x 
double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[N]], 4 ; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: ; CHECK-NEXT: ret i32 0 @@ -352,18 +352,18 @@ ; CHECK-LABEL: @cse_for_hoisted_instructions_in_preheader( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> , [[SHUFFLE]] ; CHECK-NEXT: 
[[GEP_0:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[GEP_0]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[GEP_0]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 10 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_2]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[GEP_2]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -6,11 +6,10 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 undef, i32 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> , [[SHUFFLE]] -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> , [[SHUFFLE]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: 
[[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 undef, i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> , [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> , [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -11,12 +11,12 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64*> [[TMP1]], <2 x i64*> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, <2 x i64*> [[SHUFFLE]], <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <2 x i64*> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64*> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -94,15 +94,15 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> poison, 
i64* [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP9]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64*> [[TMP1]], <2 x i64*> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, <2 x i64*> [[SHUFFLE]], <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <2 x i64*> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64*> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multiple-uses.ll @@ -16,11 +16,11 @@ ; CHECK-LABEL: @multi_uses( ; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x float> [[Y:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Y1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Y1]], i32 1 -; CHECK-NEXT: 
[[TMP3:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X:%.*]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; CHECK-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -84,21 +84,21 @@ ; THRESH1-LABEL: @f_used_twice_in_tree( ; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 ; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH1-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; THRESH1-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE]], [[X]] +; THRESH1-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP4]], [[TMP5]] ; THRESH1-NEXT: ret float [[ADD]] ; 
; THRESH2-LABEL: @f_used_twice_in_tree( ; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 ; THRESH2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESH2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; THRESH2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]] -; THRESH2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; THRESH2-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]] +; THRESH2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; THRESH2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE]], [[X]] +; THRESH2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; THRESH2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP4]], [[TMP5]] ; THRESH2-NEXT: ret float [[ADD]] ; %x0 = extractelement <2 x float> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -769,11 +769,11 @@ ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 ; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP3]], [[TMP5]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float 
[[TMP7]], [[TMP8]] +; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer +; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP3]], [[SHUFFLE]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]] ; THRESHOLD-NEXT: ret float [[OP_RDX2]] ; entry: @@ -897,11 +897,11 @@ ; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 ; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONVC]], i32 1 ; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[TMP4]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> zeroinitializer +; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], [[SHUFFLE]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP7]], [[TMP8]] ; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00 ; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll @@ -12,21 +12,21 @@ ; CHECK-NEXT: 
entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[TMP7]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x 
double> [[TMP6]], i32 1 +; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -12,10 +12,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], undef ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], undef ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -9,10 +9,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b 
to <4 x i32>*), align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SHUFFLE]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -444,14 +444,14 @@ ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 ; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] -; CHECK-NEXT: 
[[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[LOADVEC]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[LOADVEC2]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; %idx0 = getelementptr inbounds double, double* %array, i64 0 @@ -669,16 +669,16 @@ ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[EXTRA1]], i32 1 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[LOADA0]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[LOADA1]], i32 1 -; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; AVX-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 +; AVX-NEXT: 
[[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP2]], [[SHUFFLE]] +; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[EXTRB1]], i32 1 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 +; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP6]], [[SHUFFLE1]] +; AVX-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]] +; AVX-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; AVX-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; AVX-NEXT: ret void ; %idx0 = getelementptr inbounds double, double* %array, i64 0 @@ -739,15 +739,15 @@ ; AVX-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP9]], [[TMP10]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x 
double> [[TMP1]], [[SHUFFLE]] +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 +; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE1]] +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]] ; AVX-NEXT: ret double [[ADD3]] ; entry: @@ -789,11 +789,11 @@ ; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] ; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer -; SSE-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP6]], [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]] +; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer +; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP6]], [[SHUFFLE1]] +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]] ; SSE-NEXT: ret double [[RES]] ; ; AVX-LABEL: @splat_loads_with_internal_uses( @@ -806,16 +806,16 @@ ; AVX-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], 
double [[LD_2_0]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fsub <2 x double> [[TMP8]], [[TMP3]] -; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 -; AVX-NEXT: [[RES:%.*]] = fadd double [[TMP10]], [[TMP11]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 +; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE1]] +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; AVX-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP6]], [[SHUFFLE]] +; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; AVX-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]] ; AVX-NEXT: ret double [[RES]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -10,18 +10,18 @@ ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef -; 
CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 4 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[SUB102_1]], i32 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> , i32 [[SUB86_1]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x 
i32> [[TMP8]], i32 [[SUB102_3]], i32 12 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll @@ -17,21 +17,21 @@ ; CHECK-NEXT: [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i1> [[TMP3]], i1 [[ICMP_A1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[SHUFFLE]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]] ; CHECK-NEXT: br label [[WHILE_END]] ; CHECK: while.end: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5]], [[WHILE_BODY_LR_PH]] ] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @c to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[ICMP_D0:%.*]] = icmp eq i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4]], [[WHILE_BODY_LR_PH]] ] +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @c to <2 x i64>*), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[ICMP_D0:%.*]] = 
icmp eq i64 [[TMP7]], 0 ; CHECK-NEXT: br i1 [[ICMP_D0]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[AND0_TMP:%.*]] = and i64 [[TMP8]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[AND0_TMP]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (%struct.a* @a to <2 x i64>*), align 8 +; CHECK-NEXT: [[AND0_TMP:%.*]] = and i64 [[TMP7]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[AND0_TMP]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], [[TMP6]] +; CHECK-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (%struct.a* @a to <2 x i64>*), align 8 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -17,23 +17,24 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP0]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], undef -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> undef -; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[TMP5]] to <4 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; 
CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to 
i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP16]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll @@ -14,7 +14,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: ret i32 [[TMP6]] @@ -51,7 +51,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: ret i32 [[TMP6]] @@ -88,7 +88,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: ret i32 [[TMP6]] @@ -126,7 +126,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], 
i32 3 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: ret i32 [[TMP6]] @@ -164,7 +164,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: ret i32 [[TMP6]] @@ -201,7 +201,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG2:%.*]], i32 3 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: ret i32 [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -94,19 +94,19 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 
1 -; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP7]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp ule <2 x double> [[TMP6]], +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP10]], [[TMP11]] ; CHECK-NEXT: ret i1 [[NOT_OR_COND9]] ; CHECK: cleanup: ; CHECK-NEXT: ret i1 false @@ -143,12 +143,12 @@ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL]], i32 1 -; 
CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fcmp uge <2 x double> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 -; CHECK-NEXT: [[NOT_OR_COND:%.*]] = or i1 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP5]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uge <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: [[NOT_OR_COND:%.*]] = or i1 [[TMP9]], [[TMP10]] ; CHECK-NEXT: ret i1 [[NOT_OR_COND]] ; %fneg = fneg double %b diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: br label [[T:%.*]] ; CHECK: t: ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[LD]], [[SHUFFLE]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; CHECK-NEXT: store <8 x i16> [[TMP0]], <8 x i16>* [[TMP1]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0 ; CHECK-NEXT: br 
label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 @@ -19,23 +19,23 @@ ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP3]] to <2 x float>* ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = fsub <2 x float> [[TMP12]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[TMP12]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> -; CHECK-NEXT: [[TMP19]] = fadd <2 x float> [[TMP2]], [[TMP18]] -; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: 
[[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP9]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP9]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[TMP11]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x i32> +; CHECK-NEXT: [[TMP17]] = fadd <2 x float> [[TMP2]], [[TMP16]] +; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP19]], <2 x float>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP17]], <2 x float>* [[TMP21]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll @@ -13,18 +13,18 @@ ; CHECK-NEXT: bb1: ; CHECK-NEXT: [[LD:%.*]] = 
load double, double* undef, align 8 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], ; CHECK-NEXT: [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PTRA1]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRA1]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]] ; CHECK-NEXT: ret void ; bb1: @@ -117,22 +117,22 @@ ; CHECK-NEXT: bb1: ; CHECK-NEXT: [[LD:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], ; CHECK-NEXT: [[PTRA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[PTRA0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PTRA0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[SHUFFLE1]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] ; CHECK-NEXT: ret void ; bb1: @@ -167,21 +167,21 @@ ; CHECK-NEXT: bb1: ; CHECK-NEXT: [[LD:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP0:%.*]] = 
insertelement <2 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], ; CHECK-NEXT: [[PTRA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[PTRA0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PTRA0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP8]] ; CHECK-NEXT: ret void ; bb1: diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-undefs.ll @@ -6,12 +6,12 @@ ; CHECK-NEXT: for.cond.preheader: ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_INC_PREHEADER:%.*]] ; CHECK: for.inc.preheader: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[TMP0:%.*]], i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[TMP0:%.*]], i32 6 ; CHECK-NEXT: br i1 false, label [[FOR_END]], label [[L1_PREHEADER:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[DOTPR:%.*]] = phi i32 [ 0, [[FOR_INC_PREHEADER]] ], [ 0, [[FOR_COND_PREHEADER:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[DOTPR]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[L1_PREHEADER]] ; CHECK: L1.preheader: ; CHECK-NEXT: [[TMP3:%.*]] = phi <8 x i32> [ [[SHUFFLE]], [[FOR_END]] ], [ [[TMP1]], [[FOR_INC_PREHEADER]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -13,16 +13,16 @@ ; CHECK: else: ; CHECK-NEXT: [[ARG_1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[ARG]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARG]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> poison, ptr [[ARG]], i32 3 -; CHECK-NEXT: 
[[TMP8:%.*]] = shufflevector <2 x ptr> [[TMP6]], <2 x ptr> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x ptr> [[TMP9]], ptr [[ARG_1]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP10]], i32 8, <4 x i1> , <4 x i64> poison) +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <2 x ptr> [[SHUFFLE1]], <2 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[ARG]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP6]], <4 x ptr> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x ptr> [[TMP8]], ptr [[ARG_1]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP9]], i32 8, <4 x i1> , <4 x i64> poison) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i64> [ [[TMP3]], [[IF]] ], [ [[TMP11]], [[ELSE]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP3]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: [[SUB:%.*]] = fsub float 6.553500e+04, undef ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[SUB]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[SUB]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float 
[[CONV]], i32 1 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: