diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -430,26 +430,6 @@ /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -1080,6 +1060,7 @@ class BoUpSLP { struct TreeEntry; struct ScheduleData; + class ShuffleCostEstimator; class ShuffleInstructionBuilder; public: @@ -2449,6 +2430,12 @@ /// \p E. Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); + /// Create a new vector from a list of scalar values. Produces a sequence + /// which exploits values reused across lanes, and arranges the inserts + /// for ease of later optimization. + template + ResTy processBuildVector(const TreeEntry *E, Args &...Params); + /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. @@ -2490,7 +2477,7 @@ void setInsertPointAfterBundle(const TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. - Value *gather(ArrayRef VL); + Value *gather(ArrayRef VL, Value *Root = nullptr); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. @@ -2548,7 +2535,7 @@ // directly, without reordering. SmallVector Mask; inversePermutation(ReorderIndices, Mask); - if (VL.size() == Scalars.size()) + if (VL.size() == Scalars.size() && ReuseShuffleIndices.empty()) return IsSame(Scalars, Mask); if (VL.size() == ReuseShuffleIndices.size()) { ::addMask(Mask, ReuseShuffleIndices); @@ -6287,68 +6274,6 @@ return {IntrinsicCost, LibCost}; } -/// Compute the cost of creating a vector of type \p VecTy containing the -/// extracted values from \p VL. -static InstructionCost -computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, - TargetTransformInfo::ShuffleKind ShuffleKind, - ArrayRef Mask, TargetTransformInfo &TTI) { - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts || - VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; - InstructionCost Cost = 0; - - // Process extracts in blocks of EltsPerVector to check if the source vector - // operand can be re-used directly. If not, add the cost of creating a shuffle - // to extract the values into a vector register. - SmallVector RegMask(EltsPerVector, UndefMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, UndefMaskElem); - AllConsecutive = true; - continue; - } - - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == UndefMaskElem) - continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast(V)); - if (!isa(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - } - - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); - } - return Cost; -} - /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. static void @@ -6596,9 +6521,10 @@ LocalVF = SVOpTy->getNumElements(); SmallVector ExtMask(Mask.size(), UndefMaskElem); for (auto [Idx, I] : enumerate(Mask)) { - if (I == UndefMaskElem) - continue; - ExtMask[Idx] = SV->getMaskValue(I); + if (I == UndefMaskElem || + static_cast(I) >= SV->getShuffleMask().size()) + continue; + ExtMask[Idx] = SV->getMaskValue(I); } bool IsOp1Undef = isUndefVector(SV->getOperand(0), @@ -6657,10 +6583,12 @@ /// Smart shuffle instruction emission, walks through shuffles trees and /// tries to find the best matching vector for the actual shuffle /// instruction. - template - static Value *createShuffle(Value *V1, Value *V2, ArrayRef Mask, - ShuffleBuilderTy &Builder) { + template + static T createShuffle(Value *V1, Value *V2, ArrayRef Mask, + ShuffleBuilderTy &Builder) { assert(V1 && "Expected at least one vector value."); + if (V2) + Builder.resizeToMatch(V1, V2); int VF = Mask.size(); if (auto *FTy = dyn_cast(V1->getType())) VF = FTy->getNumElements(); @@ -6735,12 +6663,9 @@ } } while (PrevOp1 != Op1 || PrevOp2 != Op2); Builder.resizeToMatch(Op1, Op2); - VF = std::max(cast(Op1->getType()) - ->getElementCount() - .getKnownMinValue(), - cast(Op2->getType()) - ->getElementCount() - .getKnownMinValue()); + VF = cast(Op1->getType()) + ->getElementCount() + .getKnownMinValue(); for (int I = 0, E = Mask.size(); I < E; ++I) { if (CombinedMask2[I] != UndefMaskElem) { assert(CombinedMask1[I] == UndefMaskElem && @@ -6748,61 +6673,120 @@ CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); } } + const int Limit = CombinedMask1.size() * 2; + if (Op1 == Op2 && Limit == 2 * VF && + all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) && + (ShuffleVectorInst::isIdentityMask(CombinedMask1) || + (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) && + isa(Op1) && + cast(Op1)->getShuffleMask() == + ArrayRef(CombinedMask1)))) + return Builder.createIdentity(Op1); return Builder.createShuffleVector( Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, CombinedMask1); } if (isa(V1)) - return PoisonValue::get(FixedVectorType::get( - cast(V1->getType())->getElementType(), Mask.size())); + return Builder.createPoison( + cast(V1->getType())->getElementType(), Mask.size()); SmallVector NewMask(Mask.begin(), Mask.end()); bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); assert(V1 && "Expected non-null value after looking through shuffles."); if (!IsIdentity) return Builder.createShuffleVector(V1, NewMask); - return V1; + return Builder.createIdentity(V1); } }; } // namespace -InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, - ArrayRef VectorizedVals) { - ArrayRef VL = E->Scalars; +/// Merges shuffle masks and emits final shuffle instruction, if required. It +/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, +/// when the actual shuffle instruction is generated only if this is actually +/// required. Otherwise, the shuffle instruction emission is delayed till the +/// end of the process, to reduce the number of emitted instructions and further +/// analysis/transformations. +class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { + bool IsFinalized = false; + SmallVector CommonMask; + SmallVector InVectors; + const TargetTransformInfo &TTI; + InstructionCost Cost = 0; + ArrayRef VectorizedVals; + BoUpSLP &R; + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + class ShuffleCostBuilder { + const TargetTransformInfo &TTI; - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + static bool isEmptyOrIdentity(ArrayRef Mask, unsigned VF) { + return Mask.empty() || + (VF == Mask.size() && all_of(enumerate(Mask), [](auto Pair) { + return Pair.value() == UndefMaskElem || + Pair.index() == static_cast(Pair.value()); + })); + } - bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - // FIXME: it tries to fix a problem with MSVC buildbots. - TargetTransformInfo *TTI = this->TTI; - auto AdjustExtractsCost = [=](InstructionCost &Cost, - ArrayRef Mask) -> Value * { + public: + ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} + ~ShuffleCostBuilder() = default; + InstructionCost createShuffleVector(Value *V1, Value *, + ArrayRef Mask) const { + // Empty mask or identity mask are free. + unsigned VF = + cast(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteTwoSrc, + FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createShuffleVector(Value *V1, ArrayRef Mask) const { + // Empty mask or identity mask are free. + if (isEmptyOrIdentity(Mask, Mask.size())) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, + FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } + InstructionCost createPoison(Type *Ty, unsigned VF) const { + return TTI::TCC_Free; + } + void resizeToMatch(Value *&, Value *&) const {} + }; + + /// Smart shuffle instruction emission, walks through shuffles trees and + /// tries to find the best matching vector for the actual shuffle + /// instruction. + InstructionCost createShuffle(Value *V1, Value *V2, ArrayRef Mask) { + ShuffleCostBuilder Builder(TTI); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, + Builder); + } + +public: + ShuffleCostEstimator(TargetTransformInfo &TTI, + ArrayRef VectorizedVals, BoUpSLP &R) + : TTI(TTI), VectorizedVals(VectorizedVals), R(R) {} + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; + ArrayRef VL = E->Scalars; + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI->getNumberOfParts(VecTy); + unsigned VecNumParts = TTI.getNumberOfParts(VecTy); if (VecNumParts == VecTy->getNumElements()) return nullptr; DenseMap ExtractVectorsTys; SmallPtrSet CheckedExtracts; for (auto [I, V] : enumerate(VL)) { + // Ignore non-extractelement scalars. if (isa(V) || (!Mask.empty() && Mask[I] == UndefMaskElem)) continue; // If all users of instruction are going to be vectorized and this @@ -6811,9 +6795,9 @@ // vectorized tree. // Also, avoid adjusting the cost for extractelements with multiple uses // in different graph entries. - const TreeEntry *VE = getTreeEntry(V); + const TreeEntry *VE = R.getTreeEntry(V); if (!CheckedExtracts.insert(V).second || - !areAllUsersVectorized(cast(V), VectorizedVals) || + !R.areAllUsersVectorized(cast(V), VectorizedVals) || (VE && VE != E)) continue; auto *EE = cast(V); @@ -6822,7 +6806,7 @@ if (!EEIdx) continue; unsigned Idx = *EEIdx; - if (VecNumParts != TTI->getNumberOfParts(EE->getVectorOperandType())) { + if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { auto It = ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; It->getSecond() = std::min(It->second, Idx); @@ -6835,18 +6819,17 @@ })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. - Cost -= - TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); + Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); // Add back the cost of s|zext which is subtracted separately. - Cost += TTI->getCastInstrCost( + Cost += TTI.getCastInstrCost( Ext->getOpcode(), Ext->getType(), EE->getType(), TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } - Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, + Idx); } // Add a cost for subvector extracts/inserts if required. for (const auto &Data : ExtractVectorsTys) { @@ -6854,256 +6837,364 @@ unsigned NumElts = VecTy->getNumElements(); if (Data.second % NumElts == 0) continue; - if (TTI->getNumberOfParts(EEVTy) > VecNumParts) { + if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); if (Idx % NumElts == 0) continue; if (Idx + NumElts <= EENumElts) { - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, VecTy); } else { // Need to round up the subvector type vectorization factor to avoid a // crash in cost model functions. Make SubVT so that Idx + VF of SubVT // <= EENumElts. auto *SubVT = FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += - TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, std::nullopt, CostKind, Idx, SubVT); } } else { - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, std::nullopt, CostKind, 0, EEVTy); } } return VecBase; - }; - if (E->State == TreeEntry::NeedToGather) { - if (allConstant(VL)) - return 0; - if (isa(VL[0])) - return InstructionCost::getInvalid(); - unsigned VF = E->getVectorFactor(); - SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector Mask; - SmallVector ExtractMask; - std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - - InstructionCost Cost = 0; - bool Resized = false; - if (Value *VecBase = AdjustExtractsCost(Cost, ExtractMask)) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); + } + std::optional + needToDelay(const TreeEntry *, ArrayRef) const { + // No need to delay the cost estimation during analysis. + return std::nullopt; + } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + // Use zeroinitializer instead of actual vector value here, since they are + // not ready yet. + add(Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor())), + Constant::getNullValue(FixedVectorType::get( + E2->Scalars.front()->getType(), E2->getVectorFactor())), + Mask); + } + void add(const TreeEntry *E1, ArrayRef Mask) { + // Use zeroinitializer instead of actual vector value here, since they are + // not ready yet. + add(Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor())), + Mask); + } + /// Adds 2 input vectors and the mask for their shuffling. + void add(Value *V1, Value *V2, ArrayRef Mask) { + assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); + if (InVectors.empty()) { + InVectors.push_back(V1); + InVectors.push_back(V2); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } else if (cast(Vec->getType())->getNumElements() != + Mask.size()) { + Cost += createShuffle(Vec, nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } + Cost += createShuffle(V1, V2, Mask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx + Sz; + InVectors.front() = Vec; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + } + /// Adds another one input vector and the mask for the shuffling. + void add(Value *V1, ArrayRef Mask) { + if (InVectors.empty()) { + if (!isa(V1->getType())) { + Cost += createShuffle(V1, nullptr, CommonMask); + CommonMask.assign(Mask.size(), UndefMaskElem); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != UndefMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); + InVectors.push_back(V1); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + const auto *It = find(InVectors, V1); + if (It == InVectors.end()) { + if (InVectors.size() == 2 || + InVectors.front()->getType() != V1->getType() || + !isa(V1->getType())) { + Value *V = InVectors.front(); + if (InVectors.size() == 2) { + Cost += + createShuffle(InVectors.front(), InVectors.back(), CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } else if (cast(V->getType())->getNumElements() != + CommonMask.size()) { + Cost += createShuffle(InVectors.front(), nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = + V->getType() != V1->getType() + ? Idx + Sz + : Mask[Idx] + cast(V1->getType()) + ->getNumElements(); + if (V->getType() != V1->getType()) + Cost += createShuffle(V1, nullptr, Mask); + InVectors.front() = V; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + return; } - InstructionCost GatherCost = 0; - int Limit = Mask.size() * 2; - if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) && - ShuffleVectorInst::isIdentityMask(Mask)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - if (NeedToShuffleReuses) - GatherCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - } else { - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n"); - // Detected that instead of gather we can emit a shuffle of single/two - // previously vectorized nodes. Add the cost of the permutation rather - // than gather. - ::addMask(Mask, E->ReuseShuffleIndices); - GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask); - } - if (!all_of(GatheredScalars, UndefValue::classof)) - GatherCost += getGatherCost(GatheredScalars); - return GatherCost; - } - if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) { - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - // Found the bunch of extractelement instructions that must be gathered - // into a vector and can be represented as a permutation elements in a - // single input vector or of 2 input vectors. - InstructionCost Cost = - computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); - AdjustExtractsCost(Cost, ExtractMask); - if (NeedToShuffleReuses) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecTy, E->ReuseShuffleIndices); - return Cost; + // Check if second vector is required if the used elements are already + // used from the first one. + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) { + InVectors.push_back(V1); + break; + } } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - assert(VecTy == FinalVecTy && - "No reused scalars expected for broadcast."); + int VF = CommonMask.size(); + if (auto *FTy = dyn_cast(V1->getType())) + VF = FTy->getNumElements(); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) + CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); + } + /// Adds another one input vector and the mask for the shuffling. + void addOrdered(Value *V1, ArrayRef Order) { + SmallVector NewMask; + inversePermutation(Order, NewMask); + add(V1, NewMask); + } + Value *gather(ArrayRef VL, Value *Root = nullptr) { + auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); + auto BuildVectorCost = [&](ArrayRef VL, + Value *Root) -> InstructionCost { + InstructionCost GatherCost = 0; + SmallVector Gathers(VL.begin(), VL.end()); + BoUpSLP::ValueSet VectorizedLoads; + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. + InstructionsState S = getSameOpcode(VL, *R.TLI); + if (VL.size() > 2 && S.getOpcode() == Instruction::Load && + !S.isAltShuffle() && + !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && + !isSplat(Gathers)) { + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType()); + for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState LS = + canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE, + *R.LI, *R.TLI, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize + // it again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (!VectorizedLoads.contains(VL[I])) + continue; + // Exclude potentially vectorized loads from list of gathered + // scalars. + for (unsigned K = I, End = I + VF; K < End; ++K) + Gathers[K] = PoisonValue::get(Gathers[K]->getType()); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast(V); + ScalarsCost += TTI.getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), + LI); + } + auto *LI = cast(S.MainOp); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI); + GatherCost += ScatterVectorizeCnt * + TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + // Add the cost for the subvectors shuffling. + GatherCost += (VectorizedCnt + ScatterVectorizeCnt - 1) * + TTI.getShuffleCost(TTI::SK_Select, VecTy); + GatherCost -= ScalarsCost; + } + } else if (!Root && !allConstant(VL) && isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as + // the broadcast. const auto *It = find_if(VL, [](Value *V) { return !isa(V); }); - // If all values are undefs - consider cost free. - if (It == VL.end()) - return TTI::TCC_Free; + assert(It != VL.end() && "Expected at least one non-undef value."); // Add broadcast for non-identity shuffle only. bool NeedShuffle = count(VL, *It) > 1 && (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof)); - InstructionCost InsertCost = TTI->getVectorInstrCost( + InstructionCost InsertCost = TTI.getVectorInstrCost( Instruction::InsertElement, VecTy, CostKind, NeedShuffle ? 0 : std::distance(VL.begin(), It), PoisonValue::get(VecTy), *It); - return InsertCost + (NeedShuffle - ? TTI->getShuffleCost( - TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/std::nullopt, CostKind, - /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/*It) - : TTI::TCC_Free); - } - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) - ReuseShuffleCost = TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. - if (VL.size() > 2 && E->getOpcode() == Instruction::Load && - !E->isAltShuffle()) { - BoUpSLP::ValueSet VectorizedLoads; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); - for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, - *TLI, CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; - else - ++ScatterVectorizeCnt; - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } + return InsertCost + + (NeedShuffle ? TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, VecTy, + /*Mask=*/std::nullopt, CostKind, + /*Index=*/0, /*SubTp=*/nullptr, /*Args=*/*It) + : TTI::TCC_Free); + } + return GatherCost + R.getGatherCost(Gathers); + }; + Cost += BuildVectorCost(VL, Root); + if (!Root) { + SmallVector Vals; + for (Value *V : VL) { + if (isa(V)) { + Vals.push_back(cast(V)); + continue; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; + Vals.push_back(Constant::getNullValue(V->getType())); + } + return ConstantVector::get(Vals); + } + return ConstantVector::getSplat( + ElementCount::getFixed(VL.size()), + Constant::getNullValue(VL.front()->getType())); + } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } + /// Finalize emission of the shuffles. + InstructionCost + finalize(ArrayRef ExtMask, + function_ref &)> Action = {}) { + IsFinalized = true; + if (Action) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Cost += createShuffle(Vec, nullptr, CommonMask); } - if (!VectorizedLoads.empty()) { - InstructionCost GatherCost = 0; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + Action(Vec, CommonMask); + InVectors.front() = Vec; + } + if (!ExtMask.empty()) { + if (CommonMask.empty()) { + CommonMask.assign(ExtMask.begin(), ExtMask.end()); + } else { + SmallVector NewMask(ExtMask.size(), UndefMaskElem); + for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { + if (ExtMask[I] == UndefMaskElem) continue; - GatherCost += getGatherCost(VL.slice(I, VF)); - } - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += - TTI->getMemoryOpCost(Instruction::Load, LI->getType(), - LI->getAlign(), LI->getPointerAddressSpace(), - CostKind, TTI::OperandValueInfo(), LI); - } - auto *LI = cast(E->getMainOp()); - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI); - GatherCost += ScatterVectorizeCnt * - TTI->getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - for (int I = VF, E = VL.size(); I < E; I += VF) - GatherCost += - TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - std::nullopt, CostKind, I, LoadTy); - } - return ReuseShuffleCost + GatherCost - ScalarsCost; - } - } - return ReuseShuffleCost + getGatherCost(VL); + NewMask[I] = CommonMask[ExtMask[I]]; + } + CommonMask.swap(NewMask); + } + } + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); + return Cost; + } + if (InVectors.size() == 2) + return Cost + + createShuffle(InVectors.front(), InVectors.back(), CommonMask); + return Cost + createShuffle(InVectors.front(), nullptr, CommonMask); + } + + ~ShuffleCostEstimator() { + assert((IsFinalized || CommonMask.empty()) && + "Shuffle construction must be finalized."); + } +}; + +InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, + ArrayRef VectorizedVals) { + ArrayRef VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + if (MinBWs.count(VL[0])) + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + unsigned EntryVF = E->getVectorFactor(); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); + if (E->State == TreeEntry::NeedToGather) { + if (allConstant(VL)) + return 0; + if (isa(VL[0])) + return InstructionCost::getInvalid(); + return processBuildVector( + E, *TTI, VectorizedVals, *this); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -8861,7 +8952,7 @@ Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::gather(ArrayRef VL) { +Value *BoUpSLP::gather(ArrayRef VL, Value *Root) { // List of instructions/lanes from current block and/or the blocks which are // part of the current loop. These instructions will be inserted at the end to // make it possible to optimize loops and hoist invariant instructions out of @@ -8878,7 +8969,8 @@ for (int I = 0, E = VL.size(); I < E; ++I) { if (auto *Inst = dyn_cast(VL[I])) if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || - getTreeEntry(Inst) || (L && (L->contains(Inst)))) && + getTreeEntry(Inst) || + (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) && PostponedIndices.insert(I).second) PostponedInsts.emplace_back(Inst, I); } @@ -8901,7 +8993,7 @@ Value *Val0 = isa(VL[0]) ? cast(VL[0])->getValueOperand() : VL[0]; FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); - Value *Vec = PoisonValue::get(VecTy); + Value *Vec = Root ? Root : PoisonValue::get(VecTy); SmallVector NonConsts; // Insert constant values at first. for (int I = 0, E = VL.size(); I < E; ++I) { @@ -8911,6 +9003,18 @@ NonConsts.push_back(I); continue; } + if (Root) { + if (!isa(VL[I])) { + NonConsts.push_back(I); + continue; + } + if (isa(VL[I])) + continue; + if (auto *SV = dyn_cast(Root)) { + if (SV->getMaskValue(I) == UndefMaskElem) + continue; + } + } Vec = CreateInsertElement(Vec, VL[I], I); } // Insert non-constant values. @@ -9009,6 +9113,10 @@ } return Vec; } + Value *createIdentity(Value *V) { return V; } + Value *createPoison(Type *Ty, unsigned VF) { + return PoisonValue::get(FixedVectorType::get(Ty, VF)); + } /// Resizes 2 input vector to match the sizes, if the they are not equal /// yet. The smallest vector is resized to the size of the larger vector. void resizeToMatch(Value *&V1, Value *&V2) { @@ -9041,7 +9149,8 @@ assert(V1 && "Expected at least one vector value."); ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, R.CSEBlocks); - return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, + ShuffleBuilder); } /// Transforms mask \p CommonMask per given \p Mask to make proper set after @@ -9057,6 +9166,41 @@ ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R) : Builder(Builder), R(R) {} + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { + Value *VecBase = nullptr; + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + int Idx = Mask[I]; + if (Idx == UndefMaskElem) + continue; + auto *EI = cast(E->Scalars[I]); + VecBase = EI->getVectorOperand(); + // If all users are vectorized - can delete the extractelement itself. + if (any_of(EI->users(), + [&](User *U) { return !R.ScalarToTreeEntry.count(U); })) + continue; + R.eraseInstruction(EI); + } + return VecBase; + } + std::optional needToDelay(const TreeEntry *E, + ArrayRef Deps) const { + // No need to delay emission if all deps are ready. + if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) + return std::nullopt; + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), + E->getVectorFactor()); + Value *Vec = Builder.CreateAlignedLoad( + VecTy, PoisonValue::get(VecTy->getPointerTo()), MaybeAlign()); + return Vec; + } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + add(E1->VectorizedValue, E2->VectorizedValue, Mask); + } + void add(const TreeEntry *E1, ArrayRef Mask) { + add(E1->VectorizedValue, Mask); + } /// Adds 2 input vectors and the mask for their shuffling. void add(Value *V1, Value *V2, ArrayRef Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -9148,10 +9292,29 @@ inversePermutation(Order, NewMask); add(V1, NewMask); } + Value *gather(ArrayRef VL, Value *Root = nullptr) { + return R.gather(VL, Root); + } + Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } /// Finalize emission of the shuffles. Value * - finalize(ArrayRef ExtMask = std::nullopt) { + finalize(ArrayRef ExtMask, + function_ref &)> Action = {}) { IsFinalized = true; + if (Action) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Vec = createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + Action(Vec, CommonMask); + InVectors.front() = Vec; + } if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); @@ -9276,101 +9439,326 @@ return vectorizeTree(I->get()); } -Value *BoUpSLP::createBuildVector(const TreeEntry *E) { +template +ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); - SmallVector Gathered( - VF, PoisonValue::get(E->Scalars.front()->getType())); bool NeedFreeze = false; - SmallVector VL(E->Scalars.begin(), E->Scalars.end()); + bool RebuiltVector = false; + SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); + SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Build a mask out of the redorder indices and reorder scalars per this mask. SmallVector ReorderMask; inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) - reorderScalars(VL, ReorderMask); - SmallVector ReuseMask(VF, UndefMaskElem); - if (!allConstant(VL)) { - // For splats with can emit broadcasts instead of gathers, so try to find - // such sequences. - bool IsSplat = isSplat(VL) && (VL.size() > 2 || VL.front() == VL.back()); - SmallVector UndefPos; - DenseMap UniquePositions; - // Gather unique non-const values and all constant values. - // For repeated values, just shuffle them. - for (auto [I, V] : enumerate(VL)) { - if (isa(V)) { - if (!isa(V)) { - Gathered[I] = V; - ReuseMask[I] = I; - UndefPos.push_back(I); + reorderScalars(GatheredScalars, ReorderMask); + auto FindReusedSplat = [&](SmallVectorImpl &Mask) { + if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { + return isa(V) && !isa(V); + })) + return false; + TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; + unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx; + if (UserTE->getNumOperands() != 2) + return false; + auto *It = + find_if(VectorizableTree, [=](const std::unique_ptr &TE) { + return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) { + return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx; + }) != TE->UserTreeIndices.end(); + }); + if (It == VectorizableTree.end()) + return false; + unsigned I = + *find_if_not(Mask, [](int Idx) { return Idx == UndefMaskElem; }); + int Sz = Mask.size(); + if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && + ShuffleVectorInst::isIdentityMask(Mask)) + std::iota(Mask.begin(), Mask.end(), 0); + else + std::fill(Mask.begin(), Mask.end(), I); + return true; + }; + BVTy GatherBuilder(Params...); + ResTy Res = ResTy(); + SmallVector Mask; + SmallVector ExtractMask; + SmallVector ReuseMask; + std::optional ExtractShuffle; + std::optional GatherShuffle; + SmallVector Entries; + Type *ScalarTy = GatheredScalars.front()->getType(); + bool IsNonPoisoned = true; + bool IsUsedInExpr = false; + SmallVector ReusedEntries; + if (!all_of(GatheredScalars, UndefValue::classof)) { + // Check for gathered extracts. + ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); + SmallVector IgnoredVals; + if (UserIgnoreList) + IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); + // Need to remove vectorized extracelement instructions. + Value *VecBase = GatherBuilder.adjustExtracts(E, ExtractMask); + bool Resized = false; + if (VecBase) + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); } - continue; + // Gather extracts after we check for full matched gathers only. + if (E->getOpcode() != Instruction::Load || E->isAltShuffle() || + all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || + isSplat(E->Scalars) || + (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { + GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + } + if (GatherShuffle) { + if (std::optional Delayed = + GatherBuilder.needToDelay(E, Entries)) { + // Delay emission of gathers which are not ready yet. + PostponedGathers.insert(E); + // Postpone gather emission, will be emitted after the end of the + // process to keep correct order. + return *Delayed; } - if (isConstant(V)) { - Gathered[I] = V; - ReuseMask[I] = I; - continue; + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + if (!Resized) { + unsigned VF1 = Entries.front()->getVectorFactor(); + unsigned VF2 = Entries.back()->getVectorFactor(); + if ((VF == VF1 && GatheredScalars.size() != VF1) || + (VF == VF2 && GatheredScalars.size() != VF2)) + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); } - if (IsSplat) { - Gathered.front() = V; - ReuseMask[I] = 0; - } else { - const auto Res = UniquePositions.try_emplace(V, I); - Gathered[Res.first->second] = V; - ReuseMask[I] = Res.first->second; - } - } - if (!UndefPos.empty() && IsSplat) { - // For undef values, try to replace them with the simple broadcast. - // We can do it if the broadcasted value is guaranteed to be - // non-poisonous, or by freezing the incoming scalar value first. - auto *It = find_if(Gathered, [this, E](Value *V) { - return !isa(V) && - (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || - any_of(V->uses(), [E](const Use &U) { - // Check if the value already used in the same operation in - // one of the nodes already. - return E->UserTreeIndices.size() == 1 && - is_contained( - E->UserTreeIndices.front().UserTE->Scalars, - U.getUser()) && - E->UserTreeIndices.front().EdgeIdx != U.getOperandNo(); - })); - }); - if (It != Gathered.end()) { - // Replace undefs by the non-poisoned scalars and emit broadcast. - int Pos = std::distance(Gathered.begin(), It); - for_each(UndefPos, [&](int I) { - // Set the undef position to the non-poisoned scalar. - ReuseMask[I] = Pos; - // Replace the undef by the poison, in the mask it is replaced by non-poisoned scalar already. - if (I != Pos) - Gathered[I] = PoisonValue::get(Gathered[I]->getType()); - }); + if (*GatherShuffle == TTI::SK_PermuteSingleSrc) + IsUsedInExpr = FindReusedSplat(Mask); + // Remove shuffled elements from list of gathers. + for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { + if (Mask[I] != UndefMaskElem) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + if (Entries.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); + ReusedEntries.push_back(Entries.front()); + if (Entries.size() == 1) { + GatherBuilder.add(Entries.front(), Mask); } else { - // Replace undefs by the poisons, emit broadcast and then emit - // freeze. - for_each(UndefPos, [&](int I) { - ReuseMask[I] = UndefMaskElem; - if (isa(Gathered[I])) - Gathered[I] = PoisonValue::get(Gathered[I]->getType()); + if (Entries.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + GatherBuilder.add(Entries.front(), Entries.back(), Mask); + ReusedEntries.push_back(Entries.back()); + } + } else if (!allConstant(GatheredScalars)) { + // For splats we can emit broadcasts instead of gathers, so try to find + // such sequences. + bool IsSplat = isSplat(GatheredScalars) && + (GatheredScalars.size() > 2 || + GatheredScalars.front() == GatheredScalars.back()); + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + ReuseMask.assign(VF, UndefMaskElem); + SmallVector UndefPos; + DenseMap UniquePositions; + // Gather unique non-const values and all constant values. + // For repeated values, just shuffle them. + int NumNonConsts = 0; + int SinglePos = 0; + for (auto [I, V] : enumerate(GatheredScalars)) { + if (isa(V)) { + if (!isa(V)) { + ReuseMask[I] = I; + UndefPos.push_back(I); + } + continue; + } + if (isConstant(V)) { + ReuseMask[I] = I; + continue; + } + ++NumNonConsts; + SinglePos = I; + Value *OrigV = V; + V = PoisonValue::get(ScalarTy); + if (IsSplat) { + RebuiltVector |= I != 0; + GatheredScalars.front() = OrigV; + ReuseMask[I] = 0; + } else { + const auto Res = UniquePositions.try_emplace(OrigV, I); + RebuiltVector |= Res.first->second != I; + GatheredScalars[Res.first->second] = OrigV; + ReuseMask[I] = Res.first->second; + } + } + if (NumNonConsts == 1) { + // Restore single insert element. + RebuiltVector = false; + if (IsSplat) { + ReuseMask.assign(VF, UndefMaskElem); + std::swap(GatheredScalars.front(), GatheredScalars[SinglePos]); + if (!UndefPos.empty() && UndefPos.front() == 0) + GatheredScalars.front() = UndefValue::get(ScalarTy); + } + ReuseMask[SinglePos] = SinglePos; + } else if (!UndefPos.empty() && IsSplat) { + // For undef values, try to replace them with the simple broadcast. + // We can do it if the broadcasted value is guaranteed to be + // non-poisonous, or by freezing the incoming scalar value first. + auto *It = find_if(GatheredScalars, [this, E](Value *V) { + return !isa(V) && + (getTreeEntry(V) || isGuaranteedNotToBePoison(V) || + (E->UserTreeIndices.size() == 1 && + any_of(V->uses(), [E](const Use &U) { + // Check if the value already used in the same operation in + // one of the nodes already. + return E->UserTreeIndices.front().EdgeIdx != + U.getOperandNo() && + is_contained( + E->UserTreeIndices.front().UserTE->Scalars, + U.getUser()); + }))); }); - NeedFreeze = true; + if (It != GatheredScalars.end()) { + // Replace undefs by the non-poisoned scalars and emit broadcast. + int Pos = std::distance(GatheredScalars.begin(), It); + for_each(UndefPos, [&](int I) { + // Set the undef position to the non-poisoned scalar. + ReuseMask[I] = Pos; + // Replace the undef by the poison, in the mask it is replaced by + // non-poisoned scalar already. + if (I != Pos) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + }); + } else { + // Replace undefs by the poisons, emit broadcast and then emit + // freeze. + for_each(UndefPos, [&](int I) { + ReuseMask[I] = UndefMaskElem; + if (isa(GatheredScalars[I])) + GatheredScalars[I] = PoisonValue::get(ScalarTy); + }); + NeedFreeze = true; + } } } + } + // Combine generated extracts mask and reused scalars masks and + // corresponding input vectors. + if (ExtractShuffle) { + // Gather of extractelements can be represented as just a shuffle of + // a single/two vectors the scalars are extracted from. + // Find input vectors. + Value *Vec1 = nullptr; + Value *Vec2 = nullptr; + if (*ExtractShuffle == TTI::SK_PermuteSingleSrc) + IsUsedInExpr = FindReusedSplat(ExtractMask); + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == UndefMaskElem || + (!Mask.empty() && Mask[I] != UndefMaskElem)) { + ExtractMask[I] = UndefMaskElem; + continue; + } + if (isa(E->Scalars[I])) + continue; + auto *EI = cast(E->Scalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } + } + if (Vec2) { + IsNonPoisoned &= + isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); + GatherBuilder.add(Vec1, Vec2, ExtractMask); + } else if (Vec1) { + IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); + GatherBuilder.add(Vec1, ExtractMask); + } else { + GatherBuilder.add(PoisonValue::get(FixedVectorType::get( + ScalarTy, GatheredScalars.size())), + ExtractMask); + } + } + if (ExtractShuffle || GatherShuffle) { + // Insert non-constant scalars. + SmallVector NonConstants(GatheredScalars); + int EMSz = ExtractMask.size(); + int MSz = Mask.size(); + bool EnoughConsts = + !RebuiltVector && (!ExtractShuffle || !GatherShuffle) && + ((ExtractShuffle && + (*ExtractShuffle != TTI::SK_PermuteSingleSrc || + any_of(ExtractMask, [&](int I) { return I >= EMSz; }) || + !ShuffleVectorInst::isIdentityMask(ExtractMask))) || + (GatherShuffle && (*GatherShuffle != TTI::SK_PermuteSingleSrc || + any_of(Mask, [&](int I) { return I >= MSz; }) || + !ShuffleVectorInst::isIdentityMask(Mask))) || + count_if(GatheredScalars, [](Value *V) { + return isa(V) && !isa(V); + }) > 1); + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (EnoughConsts && isa(GatheredScalars[I])) + NonConstants[I] = PoisonValue::get(ScalarTy); + else + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + // Generate constants for final shuffle. + if (!all_of(GatheredScalars, UndefValue::classof)) { + Mask.assign(GatheredScalars.size(), UndefMaskElem); + Value *VecVal = GatherBuilder.gather(GatheredScalars); + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (!isa(GatheredScalars[I])) + Mask[I] = I; + } + GatherBuilder.add(VecVal, Mask); + IsNonPoisoned &= isGuaranteedNotToBePoison(VecVal); + } + NeedFreeze = !IsNonPoisoned && !IsUsedInExpr && + any_of(GatheredScalars, [](Value *V) { + return isa(V) && !isa(V); + }); + // Emit final insertelement instructions for defined values. + if (!RebuiltVector && !all_of(NonConstants, UndefValue::classof)) { + Res = GatherBuilder.finalize( + ReuseShuffleIndicies, [&](Value *&Vec, SmallVectorImpl &Mask) { + Vec = GatherBuilder.gather(NonConstants, Vec); + for (unsigned I = 0, Sz = Mask.size(); I < Sz; ++I) + if ((!EnoughConsts && !isa(NonConstants[I])) || + !isa(NonConstants[I])) + Mask[I] = I; + }); + } else { + if (RebuiltVector && !all_of(NonConstants, UndefValue::classof)) { + // Just generate simple gather, no reused scalars/extracts. + Value *BV = GatherBuilder.gather(NonConstants); + GatherBuilder.add(BV, ReuseMask); + } + Res = GatherBuilder.finalize(ReuseShuffleIndicies); + } } else { - ReuseMask.clear(); - copy(VL, Gathered.begin()); + // Just generate simple gather, no reused scalars/extracts. + Value *BV = GatherBuilder.gather(GatheredScalars); + GatherBuilder.add(BV, ReuseMask); + Res = GatherBuilder.finalize(ReuseShuffleIndicies); } - // Gather unique scalars and all constants. - Value *Vec = gather(Gathered); - ShuffleBuilder.add(Vec, ReuseMask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); if (NeedFreeze) - Vec = Builder.CreateFreeze(Vec); - return Vec; + Res = GatherBuilder.createFreeze(Res); + return Res; +} + +Value *BoUpSLP::createBuildVector(const TreeEntry *E) { + return processBuildVector(E, Builder, + *this); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -9381,6 +9769,14 @@ return E->VectorizedValue; } + if (E->State == TreeEntry::NeedToGather) { + if (E->getMainOp() && E->Idx == 0) + setInsertPointAfterBundle(E); + Value *Vec = createBuildVector(E); + E->VectorizedValue = Vec; + return Vec; + } + auto FinalShuffle = [&](Value *V, const TreeEntry *E) { ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); if (E->State != TreeEntry::NeedToGather && @@ -9394,199 +9790,6 @@ } return ShuffleBuilder.finalize(E->ReuseShuffleIndices); }; - - if (E->State == TreeEntry::NeedToGather) { - if (E->getMainOp() && E->Idx == 0) - setInsertPointAfterBundle(E); - unsigned VF = E->getVectorFactor(); - auto AdjustExtracts = [&](const TreeEntry *E, ArrayRef Mask) { - Value *VecBase = nullptr; - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - int Idx = Mask[I]; - if (Idx == UndefMaskElem) - continue; - auto *EI = cast(E->Scalars[I]); - VecBase = EI->getVectorOperand(); - // TODO: EI can be erased, if all its users are vectorized. But need to - // emit shuffles for such extractelement instructions. - } - return VecBase; - }; - auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef Mask) { - unsigned VF1 = cast(V1->getType())->getNumElements(); - unsigned VF2 = cast(V2->getType())->getNumElements(); - unsigned VF = std::max(VF1, VF2); - if (VF1 != VF2) { - SmallVector ExtMask(VF, UndefMaskElem); - std::iota(ExtMask.begin(), - std::next(ExtMask.begin(), std::min(VF1, VF2)), 0); - if (VF1 < VF2) { - V1 = Builder.CreateShuffleVector(V1, ExtMask); - if (auto *I = dyn_cast(V1)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } else { - V2 = Builder.CreateShuffleVector(V2, ExtMask); - if (auto *I = dyn_cast(V2)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - } - } - const int Limit = Mask.size() * 2; - if (V1 == V2 && Mask.size() == VF && - all_of(Mask, [=](int Idx) { return Idx < Limit; }) && - (ShuffleVectorInst::isIdentityMask(Mask) || - (ShuffleVectorInst::isZeroEltSplatMask(Mask) && - isa(V1) && - cast(V1)->getShuffleMask() == Mask))) - return V1; - Value *Vec = V1 == V2 ? Builder.CreateShuffleVector(V1, Mask) - : Builder.CreateShuffleVector(V1, V2, Mask); - if (auto *I = dyn_cast(Vec)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } - return Vec; - }; - auto NeedToDelay = [=](const TreeEntry *E, - ArrayRef Deps) -> Value * { - // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) - return nullptr; - // Postpone gather emission, will be emitted after the end of the - // process to keep correct order. - auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), - E->getVectorFactor()); - Value *Vec = Builder.CreateAlignedLoad( - VecTy, PoisonValue::get(VecTy->getPointerTo()), MaybeAlign()); - return Vec; - }; - - SmallVector - ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - Value *Vec = nullptr; - SmallVector Mask; - SmallVector ExtractMask; - std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - if (!all_of(GatheredScalars, UndefValue::classof)) { - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - bool Resized = false; - if (Value *VecBase = AdjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && - GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - } - if (GatherShuffle) { - if (Value *Delayed = NeedToDelay(E, Entries)) { - E->VectorizedValue = Delayed; - // Delay emission of gathers which are not ready yet. - PostponedGathers.insert(E); - // Postpone gather emission, will be emitted after the end of the - // process to keep correct order. - return Delayed; - } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != UndefMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } - } - } - if ((ExtractShuffle || GatherShuffle) && - all_of(GatheredScalars, PoisonValue::classof)) { - Value *Vec1 = nullptr; - if (ExtractShuffle) { - // Gather of extractelements can be represented as just a shuffle of - // a single/two vectors the scalars are extracted from. - // Find input vectors. - Value *Vec2 = nullptr; - for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == UndefMaskElem || - (!Mask.empty() && Mask[I] != UndefMaskElem)) { - ExtractMask[I] = UndefMaskElem; - continue; - } - if (isa(E->Scalars[I])) - continue; - auto *EI = cast(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); - } - } - if (Vec2) - Vec1 = CreateShuffle(Vec1, Vec2, ExtractMask); - else if (Vec1) - Vec1 = CreateShuffle(Vec1, Vec1, ExtractMask); - else - Vec1 = PoisonValue::get( - FixedVectorType::get(ScalarTy, GatheredScalars.size())); - } - if (GatherShuffle) { - Vec = CreateShuffle(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - if (Vec1) { - // Build final mask. - for (auto [I, Idx] : enumerate(Mask)) { - if (ExtractMask[I] != UndefMaskElem) - Idx = I; - else if (Idx != UndefMaskElem) - Idx = I + VF; - } - Vec = CreateShuffle(Vec1, Vec, Mask); - } - } else { - Vec = Vec1; - } - Vec = FinalShuffle(Vec, E); - } else { - Vec = createBuildVector(E); - } - E->VectorizedValue = Vec; - return Vec; - } - assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll --- a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll @@ -23,10 +23,11 @@ ;; the vector store that replaces them. ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32), metadata ![[ID:[0-9]+]], metadata ptr %arrayidx, metadata !DIExpression()) +; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID]] ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 4)) -; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 8)) -; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]] -; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12)) +; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID1:[0-9]+]], metadata ptr %arrayidx7, metadata !DIExpression()) +; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID1]] +; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID1]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12)) target triple = "x86_64-unknown-unknown" diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -3,27 +3,21 @@ define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; CHECK-NEXT: br label [[TMP17:%.*]] -; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ] -; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ] -; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0 -; CHECK-NEXT: br label [[TMP17]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> , <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: br label [[TMP11]] ; %4 = extractelement <2 x i64> %1, i64 0 %5 = or i64 %4, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -343,15 +343,16 @@ ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]]) -; CHECK-NEXT: ret i16 [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP2]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i16 [[OP_RDX]] ; entry: %0 = load i16, ptr %x, align 2 @@ -420,29 +421,34 @@ ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]]) -; CHECK-NEXT: ret i32 [[TMP29]] +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]]) +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP23]] +; CHECK-NEXT: ret i32 [[OP_RDX2]] ; entry: %idx.ext = sext i32 %off1 to i64 @@ -708,29 +714,29 @@ ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[X]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[Y]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]] -; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7 ; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], ptr [[ARRAYIDX72]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 ; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 ; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], ptr [[ARRAYIDX84]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,14 +839,17 @@ ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[X]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[Y]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], ptr [[DST0:%.*]], align 2 +; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i16, ptr [[DST0:%.*]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i16> [[TMP5]], [[TMP4]] +; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[DST0]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> +; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[DST4]], align 2 ; CHECK-NEXT: ret void ; entry: @@ -921,30 +930,30 @@ ; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4 ; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8 ; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP10]], [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 ; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP20]] to <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP18]], [[TMP21]] -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw <4 x i32> [[TMP26]], [[TMP29]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[DST0]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[DST4]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[DST8]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP30]], ptr [[DST12]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[DST0]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[DST4]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[DST8]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[DST12]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -1198,88 +1207,421 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P1:%.*]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P2:%.*]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 [[CONV4]], [[CONV6]] +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i32 [[SUB7]], 16 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SHL]], [[SUB]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP5]] to i32 +; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 [[CONV9]], [[CONV11]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13]], align 1 +; CHECK-NEXT: [[CONV14:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 5 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15]], align 1 +; CHECK-NEXT: [[CONV16:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[SUB17:%.*]] = sub nsw i32 [[CONV14]], [[CONV16]] +; CHECK-NEXT: [[SHL18:%.*]] = shl nsw i32 [[SUB17]], 16 +; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[SHL18]], [[SUB12]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[CONV21:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22]], align 1 +; CHECK-NEXT: [[CONV23:%.*]] = zext i8 [[TMP9]] to i32 +; CHECK-NEXT: [[SUB24:%.*]] = sub nsw i32 [[CONV21]], [[CONV23]] +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX25]], align 1 +; CHECK-NEXT: [[CONV26:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 6 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX27]], align 1 +; CHECK-NEXT: [[CONV28:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[SUB29:%.*]] = sub nsw i32 [[CONV26]], [[CONV28]] +; CHECK-NEXT: [[SHL30:%.*]] = shl nsw i32 [[SUB29]], 16 +; CHECK-NEXT: [[ADD31:%.*]] = add nsw i32 [[SHL30]], [[SUB24]] +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX34]], align 1 +; CHECK-NEXT: [[CONV35:%.*]] = zext i8 [[TMP13]] to i32 +; CHECK-NEXT: [[SUB36:%.*]] = sub nsw i32 [[CONV33]], [[CONV35]] +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37]], align 1 +; CHECK-NEXT: [[CONV38:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39]], align 1 +; CHECK-NEXT: [[CONV40:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[SUB41:%.*]] = sub nsw i32 [[CONV38]], [[CONV40]] +; CHECK-NEXT: [[SHL42:%.*]] = shl nsw i32 [[SUB41]], 16 +; CHECK-NEXT: [[ADD43:%.*]] = add nsw i32 [[SHL42]], [[SUB36]] +; CHECK-NEXT: [[ADD44:%.*]] = add nsw i32 [[ADD19]], [[ADD]] +; CHECK-NEXT: [[SUB45:%.*]] = sub nsw i32 [[ADD]], [[ADD19]] +; CHECK-NEXT: [[ADD46:%.*]] = add nsw i32 [[ADD43]], [[ADD31]] +; CHECK-NEXT: [[SUB47:%.*]] = sub nsw i32 [[ADD31]], [[ADD43]] +; CHECK-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD46]], [[ADD44]] +; CHECK-NEXT: [[SUB51:%.*]] = sub nsw i32 [[ADD44]], [[ADD46]] +; CHECK-NEXT: [[ADD55:%.*]] = add nsw i32 [[SUB47]], [[SUB45]] +; CHECK-NEXT: [[SUB59:%.*]] = sub nsw i32 [[SUB45]], [[SUB47]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[CONV2_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[CONV_1]], [[CONV2_1]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[CONV4_1:%.*]] = zext i8 [[TMP18]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[CONV6_1:%.*]] = zext i8 [[TMP19]] to i32 +; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 [[CONV4_1]], [[CONV6_1]] +; CHECK-NEXT: [[SHL_1:%.*]] = shl nsw i32 [[SUB7_1]], 16 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[SHL_1]], [[SUB_1]] +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[CONV9_1:%.*]] = zext i8 [[TMP20]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_1]], align 1 +; CHECK-NEXT: [[CONV11_1:%.*]] = zext i8 [[TMP21]] to i32 +; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 [[CONV9_1]], [[CONV11_1]] +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_1]], align 1 +; CHECK-NEXT: [[CONV14_1:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 5 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_1]], align 1 +; CHECK-NEXT: [[CONV16_1:%.*]] = zext i8 [[TMP23]] to i32 +; CHECK-NEXT: [[SUB17_1:%.*]] = sub nsw i32 [[CONV14_1]], [[CONV16_1]] +; CHECK-NEXT: [[SHL18_1:%.*]] = shl nsw i32 [[SUB17_1]], 16 +; CHECK-NEXT: [[ADD19_1:%.*]] = add nsw i32 [[SHL18_1]], [[SUB12_1]] +; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_1]], align 1 +; CHECK-NEXT: [[CONV21_1:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 2 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[CONV23_1:%.*]] = zext i8 [[TMP25]] to i32 +; CHECK-NEXT: [[SUB24_1:%.*]] = sub nsw i32 [[CONV21_1]], [[CONV23_1]] +; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1 +; CHECK-NEXT: [[CONV26_1:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 +; CHECK-NEXT: [[CONV28_1:%.*]] = zext i8 [[TMP27]] to i32 +; CHECK-NEXT: [[SUB29_1:%.*]] = sub nsw i32 [[CONV26_1]], [[CONV28_1]] +; CHECK-NEXT: [[SHL30_1:%.*]] = shl nsw i32 [[SUB29_1]], 16 +; CHECK-NEXT: [[ADD31_1:%.*]] = add nsw i32 [[SHL30_1]], [[SUB24_1]] +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP28]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_1]], align 1 +; CHECK-NEXT: [[CONV35_1:%.*]] = zext i8 [[TMP29]] to i32 +; CHECK-NEXT: [[SUB36_1:%.*]] = sub nsw i32 [[CONV33_1]], [[CONV35_1]] +; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_1]], align 1 +; CHECK-NEXT: [[CONV38_1:%.*]] = zext i8 [[TMP30]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_1]], align 1 +; CHECK-NEXT: [[CONV40_1:%.*]] = zext i8 [[TMP31]] to i32 +; CHECK-NEXT: [[SUB41_1:%.*]] = sub nsw i32 [[CONV38_1]], [[CONV40_1]] +; CHECK-NEXT: [[SHL42_1:%.*]] = shl nsw i32 [[SUB41_1]], 16 +; CHECK-NEXT: [[ADD43_1:%.*]] = add nsw i32 [[SHL42_1]], [[SUB36_1]] +; CHECK-NEXT: [[ADD44_1:%.*]] = add nsw i32 [[ADD19_1]], [[ADD_1]] +; CHECK-NEXT: [[SUB45_1:%.*]] = sub nsw i32 [[ADD_1]], [[ADD19_1]] +; CHECK-NEXT: [[ADD46_1:%.*]] = add nsw i32 [[ADD43_1]], [[ADD31_1]] +; CHECK-NEXT: [[SUB47_1:%.*]] = sub nsw i32 [[ADD31_1]], [[ADD43_1]] +; CHECK-NEXT: [[ADD48_1:%.*]] = add nsw i32 [[ADD46_1]], [[ADD44_1]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub nsw i32 [[ADD44_1]], [[ADD46_1]] +; CHECK-NEXT: [[ADD55_1:%.*]] = add nsw i32 [[SUB47_1]], [[SUB45_1]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub nsw i32 [[SUB45_1]], [[SUB47_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[CONV_2]], [[CONV2_2]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[SUB7_2:%.*]] = sub nsw i32 [[CONV4_2]], [[CONV6_2]] +; CHECK-NEXT: [[SHL_2:%.*]] = shl nsw i32 [[SUB7_2]], 16 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[SHL_2]], [[SUB_2]] +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[SUB12_2:%.*]] = sub nsw i32 [[CONV9_2]], [[CONV11_2]] +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32 +; CHECK-NEXT: [[SUB17_2:%.*]] = sub nsw i32 [[CONV14_2]], [[CONV16_2]] +; CHECK-NEXT: [[SHL18_2:%.*]] = shl nsw i32 [[SUB17_2]], 16 +; CHECK-NEXT: [[ADD19_2:%.*]] = add nsw i32 [[SHL18_2]], [[SUB12_2]] +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[SUB24_2:%.*]] = sub nsw i32 [[CONV21_2]], [[CONV23_2]] +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[SUB29_2:%.*]] = sub nsw i32 [[CONV26_2]], [[CONV28_2]] +; CHECK-NEXT: [[SHL30_2:%.*]] = shl nsw i32 [[SUB29_2]], 16 +; CHECK-NEXT: [[ADD31_2:%.*]] = add nsw i32 [[SHL30_2]], [[SUB24_2]] +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32 +; CHECK-NEXT: [[SUB36_2:%.*]] = sub nsw i32 [[CONV33_2]], [[CONV35_2]] +; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32 +; CHECK-NEXT: [[SUB41_2:%.*]] = sub nsw i32 [[CONV38_2]], [[CONV40_2]] +; CHECK-NEXT: [[SHL42_2:%.*]] = shl nsw i32 [[SUB41_2]], 16 +; CHECK-NEXT: [[ADD43_2:%.*]] = add nsw i32 [[SHL42_2]], [[SUB36_2]] +; CHECK-NEXT: [[ADD44_2:%.*]] = add nsw i32 [[ADD19_2]], [[ADD_2]] +; CHECK-NEXT: [[SUB45_2:%.*]] = sub nsw i32 [[ADD_2]], [[ADD19_2]] +; CHECK-NEXT: [[ADD46_2:%.*]] = add nsw i32 [[ADD43_2]], [[ADD31_2]] +; CHECK-NEXT: [[SUB47_2:%.*]] = sub nsw i32 [[ADD31_2]], [[ADD43_2]] +; CHECK-NEXT: [[ADD48_2:%.*]] = add nsw i32 [[ADD46_2]], [[ADD44_2]] +; CHECK-NEXT: [[SUB51_2:%.*]] = sub nsw i32 [[ADD44_2]], [[ADD46_2]] +; CHECK-NEXT: [[ADD55_2:%.*]] = add nsw i32 [[SUB47_2]], [[SUB45_2]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub nsw i32 [[SUB45_2]], [[SUB47_2]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[CONV_3]], [[CONV2_3]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = sub nsw <16 x i32> [[TMP31]], [[TMP39]] -; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP42]], <4 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP50]], <4 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32> -; CHECK-NEXT: [[TMP76:%.*]] = lshr <16 x i32> [[TMP75]], -; CHECK-NEXT: [[TMP77:%.*]] = and <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = add <16 x i32> [[TMP78]], [[TMP75]] -; CHECK-NEXT: [[TMP80:%.*]] = xor <16 x i32> [[TMP79]], [[TMP78]] -; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP80]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP81]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP81]], 16 +; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[SUB7_3:%.*]] = sub nsw i32 [[CONV4_3]], [[CONV6_3]] +; CHECK-NEXT: [[SHL_3:%.*]] = shl nsw i32 [[SUB7_3]], 16 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[SHL_3]], [[SUB_3]] +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 +; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP52]] to i32 +; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1 +; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1 +; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP53]] to i32 +; CHECK-NEXT: [[SUB12_3:%.*]] = sub nsw i32 [[CONV9_3]], [[CONV11_3]] +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1 +; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP54]] to i32 +; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5 +; CHECK-NEXT: [[TMP55:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1 +; CHECK-NEXT: [[CONV16_3:%.*]] = zext i8 [[TMP55]] to i32 +; CHECK-NEXT: [[SUB17_3:%.*]] = sub nsw i32 [[CONV14_3]], [[CONV16_3]] +; CHECK-NEXT: [[SHL18_3:%.*]] = shl nsw i32 [[SUB17_3]], 16 +; CHECK-NEXT: [[ADD19_3:%.*]] = add nsw i32 [[SHL18_3]], [[SUB12_3]] +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP56:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[CONV21_3:%.*]] = zext i8 [[TMP56]] to i32 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2 +; CHECK-NEXT: [[TMP57:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[CONV23_3:%.*]] = zext i8 [[TMP57]] to i32 +; CHECK-NEXT: [[SUB24_3:%.*]] = sub nsw i32 [[CONV21_3]], [[CONV23_3]] +; CHECK-NEXT: [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP58:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1 +; CHECK-NEXT: [[CONV26_3:%.*]] = zext i8 [[TMP58]] to i32 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6 +; CHECK-NEXT: [[TMP59:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[CONV28_3:%.*]] = zext i8 [[TMP59]] to i32 +; CHECK-NEXT: [[SUB29_3:%.*]] = sub nsw i32 [[CONV26_3]], [[CONV28_3]] +; CHECK-NEXT: [[SHL30_3:%.*]] = shl nsw i32 [[SUB29_3]], 16 +; CHECK-NEXT: [[ADD31_3:%.*]] = add nsw i32 [[SHL30_3]], [[SUB24_3]] +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1 +; CHECK-NEXT: [[CONV33_3:%.*]] = zext i8 [[TMP60]] to i32 +; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3 +; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1 +; CHECK-NEXT: [[CONV35_3:%.*]] = zext i8 [[TMP61]] to i32 +; CHECK-NEXT: [[SUB36_3:%.*]] = sub nsw i32 [[CONV33_3]], [[CONV35_3]] +; CHECK-NEXT: [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1 +; CHECK-NEXT: [[CONV38_3:%.*]] = zext i8 [[TMP62]] to i32 +; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1 +; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP63]] to i32 +; CHECK-NEXT: [[SUB41_3:%.*]] = sub nsw i32 [[CONV38_3]], [[CONV40_3]] +; CHECK-NEXT: [[SHL42_3:%.*]] = shl nsw i32 [[SUB41_3]], 16 +; CHECK-NEXT: [[ADD43_3:%.*]] = add nsw i32 [[SHL42_3]], [[SUB36_3]] +; CHECK-NEXT: [[ADD44_3:%.*]] = add nsw i32 [[ADD19_3]], [[ADD_3]] +; CHECK-NEXT: [[SUB45_3:%.*]] = sub nsw i32 [[ADD_3]], [[ADD19_3]] +; CHECK-NEXT: [[ADD46_3:%.*]] = add nsw i32 [[ADD43_3]], [[ADD31_3]] +; CHECK-NEXT: [[SUB47_3:%.*]] = sub nsw i32 [[ADD31_3]], [[ADD43_3]] +; CHECK-NEXT: [[ADD48_3:%.*]] = add nsw i32 [[ADD46_3]], [[ADD44_3]] +; CHECK-NEXT: [[SUB51_3:%.*]] = sub nsw i32 [[ADD44_3]], [[ADD46_3]] +; CHECK-NEXT: [[ADD55_3:%.*]] = add nsw i32 [[SUB47_3]], [[SUB45_3]] +; CHECK-NEXT: [[SUB59_3:%.*]] = sub nsw i32 [[SUB45_3]], [[SUB47_3]] +; CHECK-NEXT: [[ADD78:%.*]] = add nsw i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub nsw i32 [[ADD48]], [[ADD48_1]] +; CHECK-NEXT: [[ADD94:%.*]] = add nsw i32 [[ADD48_3]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub nsw i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 [[ADD94]], [[ADD78]] +; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 [[ADD78]], [[ADD94]] +; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 [[SUB102]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 [[SUB86]], [[SUB102]] +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535 +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]] +; CHECK-NEXT: [[SHR_I184:%.*]] = lshr i32 [[ADD105]], 15 +; CHECK-NEXT: [[AND_I185:%.*]] = and i32 [[SHR_I184]], 65537 +; CHECK-NEXT: [[MUL_I186:%.*]] = mul nuw i32 [[AND_I185]], 65535 +; CHECK-NEXT: [[ADD_I187:%.*]] = add i32 [[MUL_I186]], [[ADD105]] +; CHECK-NEXT: [[XOR_I188:%.*]] = xor i32 [[ADD_I187]], [[MUL_I186]] +; CHECK-NEXT: [[SHR_I189:%.*]] = lshr i32 [[SUB104]], 15 +; CHECK-NEXT: [[AND_I190:%.*]] = and i32 [[SHR_I189]], 65537 +; CHECK-NEXT: [[MUL_I191:%.*]] = mul nuw i32 [[AND_I190]], 65535 +; CHECK-NEXT: [[ADD_I192:%.*]] = add i32 [[MUL_I191]], [[SUB104]] +; CHECK-NEXT: [[XOR_I193:%.*]] = xor i32 [[ADD_I192]], [[MUL_I191]] +; CHECK-NEXT: [[SHR_I194:%.*]] = lshr i32 [[SUB106]], 15 +; CHECK-NEXT: [[AND_I195:%.*]] = and i32 [[SHR_I194]], 65537 +; CHECK-NEXT: [[MUL_I196:%.*]] = mul nuw i32 [[AND_I195]], 65535 +; CHECK-NEXT: [[ADD_I197:%.*]] = add i32 [[MUL_I196]], [[SUB106]] +; CHECK-NEXT: [[XOR_I198:%.*]] = xor i32 [[ADD_I197]], [[MUL_I196]] +; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I188]], [[XOR_I]] +; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I193]] +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I198]] +; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 [[ADD55_1]], [[ADD55]] +; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 [[ADD55]], [[ADD55_1]] +; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 [[ADD55_3]], [[ADD55_2]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 [[ADD55_2]], [[ADD55_3]] +; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]] +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15 +; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 +; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535 +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]] +; CHECK-NEXT: [[SHR_I184_1:%.*]] = lshr i32 [[ADD105_1]], 15 +; CHECK-NEXT: [[AND_I185_1:%.*]] = and i32 [[SHR_I184_1]], 65537 +; CHECK-NEXT: [[MUL_I186_1:%.*]] = mul nuw i32 [[AND_I185_1]], 65535 +; CHECK-NEXT: [[ADD_I187_1:%.*]] = add i32 [[MUL_I186_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I188_1:%.*]] = xor i32 [[ADD_I187_1]], [[MUL_I186_1]] +; CHECK-NEXT: [[SHR_I189_1:%.*]] = lshr i32 [[SUB104_1]], 15 +; CHECK-NEXT: [[AND_I190_1:%.*]] = and i32 [[SHR_I189_1]], 65537 +; CHECK-NEXT: [[MUL_I191_1:%.*]] = mul nuw i32 [[AND_I190_1]], 65535 +; CHECK-NEXT: [[ADD_I192_1:%.*]] = add i32 [[MUL_I191_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I193_1:%.*]] = xor i32 [[ADD_I192_1]], [[MUL_I191_1]] +; CHECK-NEXT: [[SHR_I194_1:%.*]] = lshr i32 [[SUB106_1]], 15 +; CHECK-NEXT: [[AND_I195_1:%.*]] = and i32 [[SHR_I194_1]], 65537 +; CHECK-NEXT: [[MUL_I196_1:%.*]] = mul nuw i32 [[AND_I195_1]], 65535 +; CHECK-NEXT: [[ADD_I197_1:%.*]] = add i32 [[MUL_I196_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I198_1:%.*]] = xor i32 [[ADD_I197_1]], [[MUL_I196_1]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I188_1]], [[ADD113]] +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I193_1]] +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I198_1]] +; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 [[SUB51_1]], [[SUB51]] +; CHECK-NEXT: [[SUB86_2:%.*]] = sub nsw i32 [[SUB51]], [[SUB51_1]] +; CHECK-NEXT: [[ADD94_2:%.*]] = add nsw i32 [[SUB51_3]], [[SUB51_2]] +; CHECK-NEXT: [[SUB102_2:%.*]] = sub nsw i32 [[SUB51_2]], [[SUB51_3]] +; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 [[ADD94_2]], [[ADD78_2]] +; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], [[ADD94_2]] +; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 [[SUB102_2]], [[SUB86_2]] +; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 [[SUB86_2]], [[SUB102_2]] +; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15 +; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 +; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535 +; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] +; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]] +; CHECK-NEXT: [[SHR_I184_2:%.*]] = lshr i32 [[ADD105_2]], 15 +; CHECK-NEXT: [[AND_I185_2:%.*]] = and i32 [[SHR_I184_2]], 65537 +; CHECK-NEXT: [[MUL_I186_2:%.*]] = mul nuw i32 [[AND_I185_2]], 65535 +; CHECK-NEXT: [[ADD_I187_2:%.*]] = add i32 [[MUL_I186_2]], [[ADD105_2]] +; CHECK-NEXT: [[XOR_I188_2:%.*]] = xor i32 [[ADD_I187_2]], [[MUL_I186_2]] +; CHECK-NEXT: [[SHR_I189_2:%.*]] = lshr i32 [[SUB104_2]], 15 +; CHECK-NEXT: [[AND_I190_2:%.*]] = and i32 [[SHR_I189_2]], 65537 +; CHECK-NEXT: [[MUL_I191_2:%.*]] = mul nuw i32 [[AND_I190_2]], 65535 +; CHECK-NEXT: [[ADD_I192_2:%.*]] = add i32 [[MUL_I191_2]], [[SUB104_2]] +; CHECK-NEXT: [[XOR_I193_2:%.*]] = xor i32 [[ADD_I192_2]], [[MUL_I191_2]] +; CHECK-NEXT: [[SHR_I194_2:%.*]] = lshr i32 [[SUB106_2]], 15 +; CHECK-NEXT: [[AND_I195_2:%.*]] = and i32 [[SHR_I194_2]], 65537 +; CHECK-NEXT: [[MUL_I196_2:%.*]] = mul nuw i32 [[AND_I195_2]], 65535 +; CHECK-NEXT: [[ADD_I197_2:%.*]] = add i32 [[MUL_I196_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I198_2:%.*]] = xor i32 [[ADD_I197_2]], [[MUL_I196_2]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I188_2]], [[ADD113_1]] +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]] +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I193_2]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I198_2]] +; CHECK-NEXT: [[ADD78_3:%.*]] = add nsw i32 [[SUB59_1]], [[SUB59]] +; CHECK-NEXT: [[SUB86_3:%.*]] = sub nsw i32 [[SUB59]], [[SUB59_1]] +; CHECK-NEXT: [[ADD94_3:%.*]] = add nsw i32 [[SUB59_3]], [[SUB59_2]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 [[SUB59_2]], [[SUB59_3]] +; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 [[ADD94_3]], [[ADD78_3]] +; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 [[ADD78_3]], [[ADD94_3]] +; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], [[SUB86_3]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 [[SUB86_3]], [[SUB102_3]] +; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15 +; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537 +; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535 +; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]] +; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]] +; CHECK-NEXT: [[SHR_I184_3:%.*]] = lshr i32 [[ADD105_3]], 15 +; CHECK-NEXT: [[AND_I185_3:%.*]] = and i32 [[SHR_I184_3]], 65537 +; CHECK-NEXT: [[MUL_I186_3:%.*]] = mul nuw i32 [[AND_I185_3]], 65535 +; CHECK-NEXT: [[ADD_I187_3:%.*]] = add i32 [[MUL_I186_3]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I188_3:%.*]] = xor i32 [[ADD_I187_3]], [[MUL_I186_3]] +; CHECK-NEXT: [[SHR_I189_3:%.*]] = lshr i32 [[SUB104_3]], 15 +; CHECK-NEXT: [[AND_I190_3:%.*]] = and i32 [[SHR_I189_3]], 65537 +; CHECK-NEXT: [[MUL_I191_3:%.*]] = mul nuw i32 [[AND_I190_3]], 65535 +; CHECK-NEXT: [[ADD_I192_3:%.*]] = add i32 [[MUL_I191_3]], [[SUB104_3]] +; CHECK-NEXT: [[XOR_I193_3:%.*]] = xor i32 [[ADD_I192_3]], [[MUL_I191_3]] +; CHECK-NEXT: [[SHR_I194_3:%.*]] = lshr i32 [[SUB106_3]], 15 +; CHECK-NEXT: [[AND_I195_3:%.*]] = and i32 [[SHR_I194_3]], 65537 +; CHECK-NEXT: [[MUL_I196_3:%.*]] = mul nuw i32 [[AND_I195_3]], 65535 +; CHECK-NEXT: [[ADD_I197_3:%.*]] = add i32 [[MUL_I196_3]], [[SUB106_3]] +; CHECK-NEXT: [[XOR_I198_3:%.*]] = xor i32 [[ADD_I197_3]], [[MUL_I196_3]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I188_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]] +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I193_3]] +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I198_3]] +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[ADD113_3]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll @@ -7,21 +7,20 @@ ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -47,21 +46,20 @@ ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub reassoc <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd reassoc <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -88,21 +86,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -129,21 +126,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -170,21 +166,20 @@ ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul nnan <2 x float> [[SHUFFLE1]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul nnan <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub nnan <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd nnan <2 x float> [[TMP5]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: store float [[TMP9]], ptr [[B]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]] +; CHECK-NEXT: [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]] +; CHECK-NEXT: [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]] +; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1 @@ -267,16 +262,16 @@ ; CHECK-NEXT: [[SUB_I1096:%.*]] = fsub fast float 1.000000e+00, [[TMP0:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP1]], [[SHUFFLE2]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[SHUFFLE1]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x float> [[SHUFFLE1]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[SUB_I1096]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x float> [[SHUFFLE]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x float> [[SHUFFLE]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP10]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -18,19 +18,20 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[A]], align 4 +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[LD2:%.*]] = load float, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul fast float [[LD0]], [[LD1]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul fast float [[LD2]], [[LD1]] +; CHECK-NEXT: store float [[MUL0]], ptr [[A]], align 4 +; CHECK-NEXT: store float [[MUL1]], ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[GEP2]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -211,15 +211,14 @@ ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) -; CHECK-NEXT: store <9 x double> [[TMP4]], ptr [[PTR_1]], align 8 +; CHECK-NEXT: store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -232,23 +232,27 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i64 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i64 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[AB4]], i64 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i64 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -232,23 +232,27 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i64 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i64 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[AB4]], i64 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i64 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll @@ -607,26 +607,35 @@ ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A:%.*]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B:%.*]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> ; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] -; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> -; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> -; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> -; SLM-NEXT: ret <8 x double> [[R73]] +; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]] +; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]] +; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]] +; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]] +; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]] +; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[TMP4]], double [[C2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7 +; SLM-NEXT: ret <8 x double> [[R7]] ; ; AVX-LABEL: @buildvector_div_8f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll @@ -607,26 +607,35 @@ ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A:%.*]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B:%.*]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> ; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] -; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> -; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> -; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> -; SLM-NEXT: ret <8 x double> [[R73]] +; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]] +; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]] +; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]] +; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]] +; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]] +; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]] +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[TMP4]], double [[C2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7 +; SLM-NEXT: ret <8 x double> [[R7]] ; ; AVX-LABEL: @buildvector_div_8f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -67,7 +67,7 @@ ; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 ; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,13 +16,28 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer -; SSE-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]] -; SSE-NEXT: store <16 x i8> [[TMP6]], ptr @cle, align 16 +; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]] +; SSE-NEXT: store i8 [[TMP1]], ptr @cle, align 16 +; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]] +; SSE-NEXT: store i8 [[TMP2]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 1), align 1 +; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]] +; SSE-NEXT: store i8 [[TMP3]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 2), align 1 +; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]] +; SSE-NEXT: store i8 [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 3), align 1 +; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]] +; SSE-NEXT: store i8 [[TMP5]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 4), align 1 +; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]] +; SSE-NEXT: store i8 [[TMP6]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 5), align 1 +; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]] +; SSE-NEXT: store i8 [[TMP7]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 6), align 1 +; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] +; SSE-NEXT: store i8 [[TMP8]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 7), align 1 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32> zeroinitializer +; SSE-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> poison, i8 [[C]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <8 x i32> zeroinitializer +; SSE-NEXT: [[TMP13:%.*]] = xor <8 x i8> [[TMP10]], [[TMP12]] +; SSE-NEXT: store <8 x i8> [[TMP13]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 8), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( @@ -95,11 +110,10 @@ ; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer ; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; AVX-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]] -; AVX-NEXT: store <4 x i32> [[TMP9]], ptr @cle32, align 16 +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[B:%.*]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP5]], [[TMP7]] +; AVX-NEXT: store <4 x i32> [[TMP8]], ptr @cle32, align 16 ; AVX-NEXT: ret void ; %add1 = add i32 %c, %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll @@ -5,23 +5,19 @@ define i1 @foo() { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4 -; CHECK-NEXT: br i1 false, label [[TMP15:%.*]], label [[TMP2:%.*]] +; CHECK-NEXT: br i1 false, label [[TMP11:%.*]], label [[TMP2:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> zeroinitializer, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP7]], <4 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> -; CHECK-NEXT: br label [[TMP15]] -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x float> [ [[TMP14]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP5]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP10]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ] ; CHECK-NEXT: ret i1 false ; %1 = load float, ptr null, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 ; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <2 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]] ; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -127,10 +127,10 @@ ; CHECK-LABEL: @dct36( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, ptr [[INBUF:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[INBUF]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[ARRAYIDX44]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[INBUF]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[ARRAYIDX44]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll @@ -23,7 +23,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP2]], 7 ; CHECK-NEXT: store i32 [[AND]], ptr @a, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP0]], i32 0, i32 1 ; CHECK-NEXT: switch i32 [[AND]], label [[IF_END:%.*]] [ ; CHECK-NEXT: i32 7, label [[SAVE_STATE_AND_RETURN]] ; CHECK-NEXT: i32 0, label [[SAVE_STATE_AND_RETURN]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -99,15 +99,7 @@ ; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: ; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double 6.000000e-02, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> , [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> , [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> , [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> , [[TMP6]] -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8 +; CHECK-NEXT: store <2 x double> , ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8 ; CHECK-NEXT: br label [[IF_THEN78]] ; CHECK: if.then78: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -19,14 +19,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], ; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[G]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[G]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP4]], 4.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[ARRAYIDX9]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL11]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[ARRAYIDX9]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -10,11 +10,10 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: @@ -23,10 +22,10 @@ ; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX11]], [[BB1]] ], [ 0, [[BB2:%.*]] ] ; CHECK-NEXT: ret i32 0 ; CHECK: bb4: -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP2]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX8:%.*]] = add i32 [[TMP10]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP2]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[OP_RDX8:%.*]] = add i32 [[TMP9]], 0 ; CHECK-NEXT: [[OP_RDX9:%.*]] = add i32 [[OP_RDX8]], [[TMP3]] ; CHECK-NEXT: ret i32 [[OP_RDX9]] ; CHECK: bb5: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -9,11 +9,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SHUFFLE]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], ptr @a, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @a, align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -524,37 +524,22 @@ define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: @foo( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @foo( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0 %sub14.i167 = fsub float undef, %vecext.i291.i166 @@ -643,12 +628,11 @@ ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP0]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]] ; SSE-NEXT: ret double [[ADD3]] ; ; AVX-LABEL: @splat_loads( @@ -700,14 +684,13 @@ ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP0]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP6]], [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer +; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]] ; SSE-NEXT: ret double [[RES]] ; ; AVX-LABEL: @splat_loads_with_internal_uses( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -15,15 +15,13 @@ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> , i32 [[SUB86_1]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 12 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SUB102_3]], i32 12 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_3]], i32 15 +; CHECK-NEXT: [[TMP9:%.*]] = freeze <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = lshr <16 x i32> [[TMP12]], ; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -187,9 +187,9 @@ ; CHECK-NEXT: br i1 [[C_1:%.*]], label [[BB16:%.*]], label [[BB6:%.*]] ; CHECK: bb6: ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[ARG1:%.*]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG2:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[ARG2:%.*]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], zeroinitializer ; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP8]], align 4 ; CHECK-NEXT: ret void @@ -226,9 +226,9 @@ ; CHECK-NEXT: [[TMP38:%.*]] = fadd float 0.000000e+00, [[TMP37]] ; CHECK-NEXT: store float [[TMP38]], ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[ARG4]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP39]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> zeroinitializer, [[TMP7]] -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[TMP39]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[TMP39]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> zeroinitializer, [[TMP4]] +; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP39]], align 4 ; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARG3:%.*]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARG4]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = fadd float 0.000000e+00, [[TMP45]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -9,19 +9,19 @@ define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) { ; CHECK-LABEL: @shuffle_operands1( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @shuffle_operands1( -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 -; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE2-NEXT: store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1 +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: ret void ; %from_1 = getelementptr double, ptr %from, i64 1 @@ -41,11 +41,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -55,11 +55,11 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -89,11 +89,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -103,11 +103,11 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -137,11 +137,11 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -151,11 +151,11 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -185,10 +185,10 @@ ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: @@ -199,10 +199,10 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: @@ -251,10 +251,10 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: @@ -304,10 +304,10 @@ ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]] +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: @@ -345,7 +345,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 @@ -355,20 +355,20 @@ ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 ; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]] +; CHECK-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 +; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]] ; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void @@ -380,7 +380,7 @@ ; SSE2-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16 ; SSE2-NEXT: br label [[FOR_BODY3:%.*]] ; SSE2: for.body3: -; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 @@ -390,20 +390,20 @@ ; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 ; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] -; SSE2-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; SSE2-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 +; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0 +; SSE2-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]] +; SSE2-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; SSE2-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]] -; SSE2-NEXT: [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 -; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] +; SSE2-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]] +; SSE2-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4 +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 +; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]] ; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 -; SSE2-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; SSE2-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 ; SSE2-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; SSE2: for.end: ; SSE2-NEXT: ret void @@ -458,17 +458,17 @@ define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){ ; CHECK-LABEL: @load_reorder_double( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @load_reorder_double( -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 -; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE2-NEXT: store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4 ; SSE2-NEXT: ret void ; %1 = load double, ptr %a @@ -493,17 +493,17 @@ define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){ ; CHECK-LABEL: @load_reorder_float( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @load_reorder_float( -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 -; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4 ; SSE2-NEXT: ret void ; %1 = load float, ptr %a @@ -542,21 +542,21 @@ define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) { ; CHECK-LABEL: @opcode_reorder( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]] -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @opcode_reorder( -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 -; SSE2-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]] -; SSE2-NEXT: store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 +; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]] +; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4 ; SSE2-NEXT: ret void ; %1 = load float, ptr %b diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -26,11 +26,11 @@ ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]] ; CHECK: if.else: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 10 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[IF_ELSE]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[IF_ELSE]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[A]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -73,19 +73,19 @@ define i32 @foo2(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n, i32 %m) #0 { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4]] = fadd <2 x double> [[TMP3]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_019]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[B:%.*]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret i32 0 ; entry: @@ -138,41 +138,40 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP12]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], -; CHECK-NEXT: [[TMP16]] = fadd <4 x float> [[TMP6]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121 +; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], +; CHECK-NEXT: [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -68,14 +68,13 @@ ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> ; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer ; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP5]], <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP5]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <2 x i64> [[TMP12]], ptr [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], +; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], +; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] +; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( @@ -89,14 +88,13 @@ ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP4]], <2 x i32> -; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], -; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], -; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> -; AVX-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP4]], -; AVX-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] -; AVX-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0 +; AVX-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], +; AVX-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], +; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] +; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 ; AVX-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 @@ -25,20 +25,33 @@ ; ; AVX-LABEL: @foo( ; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX-NEXT: store i32 [[TMP1]], ptr @a, align 16 ; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; AVX-NEXT: store <8 x i32> [[SHUFFLE]], ptr @a, align 16 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4 +; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4 +; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4 +; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4 ; AVX-NEXT: ret void ; +; AVX2-LABEL: @foo( +; AVX2-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX2-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX2-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 +; AVX2-NEXT: ret void +; ; AVX512-LABEL: @foo( ; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 ; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 ; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], ptr @a, align 16 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; %1 = load i32, ptr @b, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; FIXME: fix the cost of the xor reduction ops + define i16 @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: @@ -8,19 +10,17 @@ ; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6 ; CHECK-NEXT: br label [[WHILE:%.*]] ; CHECK: while: -; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ] +; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX27:%.*]], [[WHILE]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr null, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]] -; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]] -; CHECK-NEXT: [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]] +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 0, [[TMP2]] +; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX26:%.*]] = xor i64 [[OP_RDX24]], [[OP_RDX25]] +; CHECK-NEXT: [[OP_RDX27]] = xor i64 [[OP_RDX26]], [[TMP5]] ; CHECK-NEXT: br label [[WHILE]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -329,20 +329,12 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: @logical_and_icmp_clamp_v8i32( -; CHECK-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 -; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y1]], i32 5 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y2]], i32 6 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y3]], i32 7 -; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]]) -; CHECK-NEXT: ret i1 [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[Y:%.*]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) +; CHECK-NEXT: ret i1 [[TMP5]] ; %x0 = extractelement <8 x i32> %x, i32 0 %x1 = extractelement <8 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -89,7 +89,7 @@ ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 @@ -138,7 +138,7 @@ ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -11,18 +11,18 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x double> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x double> poison) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) -; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: [[LD:%.*]] = load <8 x i16>, ptr undef, align 16 ; CHECK-NEXT: br label [[T:%.*]] ; CHECK: t: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[LD]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[LD]], [[TMP0]] ; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[PTR:%.*]], align 2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll @@ -14,17 +14,16 @@ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x ptr> , <8 x ptr> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP4]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP13]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP14]], false +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false ; CHECK-NEXT: ret i1 [[OP_RDX]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison) ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: store <16 x float> [[TMP11]], ptr [[TMP5]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll @@ -17,9 +17,8 @@ ; CHECK-NEXT: [[T13:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: br label [[ELSE1:%.*]] ; CHECK: else1: -; CHECK-NEXT: [[T20:%.*]] = extractelement <2 x i32> [[T13]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[BF_CAST162]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[T20]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[T13]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[BF_CAST162]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt <2 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; CHECK-NEXT: ret i1 [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,10 +12,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> , <2 x float> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP0]], float 0.000000e+00, i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: @@ -23,12 +23,12 @@ ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x float> [[TMP14]], zeroinitializer +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -7,13 +7,12 @@ ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]] -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP27]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> undef, <4 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP27]], align 8 ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -13,16 +13,15 @@ ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer -; CHECK-NEXT: store <4 x float> [[TMP13]], ptr [[RESULT]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[SHUFFLE]], [[TMP6]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[SHUFFLE1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], zeroinitializer +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[RESULT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; entry: