diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -540,6 +540,114 @@
   return *EI->idx_begin();
 }
 
+/// Tries to find extractelement instructions with constant indices from fixed
+/// vector type and gather such instructions into a bunch, which is highly
+/// likely to be detected as a shuffle of 1 or 2 input vectors. If the attempt
+/// is successful, the matched scalars are replaced by poison values in \p VL
+/// for future analysis.
+static std::optional<TargetTransformInfo::ShuffleKind>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+                           SmallVectorImpl<int> &Mask) {
+  // Scan list of gathered scalars for extractelements that can be represented
+  // as shuffles.
+  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+  SmallVector<int> UndefVectorExtracts;
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+    if (!EI)
+      continue;
+    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+      continue;
+    std::optional<unsigned> Idx = getExtractIndex(EI);
+    // Undefined index.
+    if (!Idx) {
+      UndefVectorExtracts.push_back(I);
+      continue;
+    }
+    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+    ExtractMask.reset(*Idx);
+    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+      UndefVectorExtracts.push_back(I);
+      continue;
+    }
+    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+  }
+  // Sort the vector operands by the maximum number of uses in extractelements.
+  MapVector<unsigned, SmallVector<Value *>> VFToVector;
+  for (const auto &Data : VectorOpToIdx)
+    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+        .push_back(Data.first);
+  for (auto &Data : VFToVector) {
+    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+      return VectorOpToIdx.find(V1)->second.size() >
+             VectorOpToIdx.find(V2)->second.size();
+    });
+  }
+  // Find the best pair of the vectors with the same number of elements or a
+  // single vector.
+  const int UndefSz = UndefVectorExtracts.size();
+  unsigned SingleMax = 0;
+  Value *SingleVec = nullptr;
+  unsigned PairMax = 0;
+  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+  for (auto &Data : VFToVector) {
+    Value *V1 = Data.second.front();
+    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+      SingleVec = V1;
+    }
+    Value *V2 = nullptr;
+    if (Data.second.size() > 1)
+      V2 = *std::next(Data.second.begin());
+    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+                            UndefSz) {
+      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+      PairVec = std::make_pair(V1, V2);
+    }
+  }
+  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+    return std::nullopt;
+  // Check if it is better to perform a shuffle of 2 vectors or just of a
+  // single vector.
+  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+  SmallVector<Value *> GatheredExtracts(
+      VL.size(), PoisonValue::get(VL.front()->getType()));
+  if (SingleMax >= PairMax && SingleMax) {
+    for (int Idx : VectorOpToIdx[SingleVec])
+      std::swap(GatheredExtracts[Idx], VL[Idx]);
+  } else {
+    for (Value *V : {PairVec.first, PairVec.second})
+      for (int Idx : VectorOpToIdx[V])
+        std::swap(GatheredExtracts[Idx], VL[Idx]);
+  }
+  // Add extracts from undefs too.
+  for (int Idx : UndefVectorExtracts)
+    std::swap(GatheredExtracts[Idx], VL[Idx]);
+  // Check that gather of extractelements can be represented as just a
+  // shuffle of a single/two vectors the scalars are extracted from.
+  std::optional<TargetTransformInfo::ShuffleKind> Res =
+      isFixedVectorShuffle(GatheredExtracts, Mask);
+  if (!Res) {
+    // TODO: try to check other subsets if possible.
+    // Restore the original VL if the attempt was not successful.
+    VL.swap(SavedVL);
+    return std::nullopt;
+  }
+  // Restore unused scalars from mask, if some of the extractelements were not
+  // selected for shuffle.
+  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+        is_contained(UndefVectorExtracts, I))
+      continue;
+    if (Mask[I] == UndefMaskElem)
+      std::swap(VL[I], GatheredExtracts[I]);
+  }
+  return Res;
+}
+
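Note: as an illustration of the pattern the helper above matches, here is a minimal IR sketch (hypothetical function and values, not taken from the tests below). Constant-index extractelements from two equal-width vectors gather into a single two-source shufflevector, which is what isFixedVectorShuffle then reports via its ShuffleKind result:

define <4 x i32> @gather_as_shuffle(<4 x i32> %a, <4 x i32> %b) {
  ; Scalars requested by the gather: a[0], a[1], b[2], b[3].
  ; Instead of 4 extracts plus 4 inserts, the whole gather is one shuffle;
  ; mask values 0-3 index %a, 4-7 index %b.
  %v = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %v
}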
 namespace {
 /// Main data required for vectorization of instructions.
 
@@ -6622,15 +6730,19 @@
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   // FIXME: it tries to fix a problem with MSVC buildbots.
   TargetTransformInfo *TTI = this->TTI;
-  auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+  auto AdjustExtractsCost = [=](InstructionCost &Cost,
+                                ArrayRef<int> Mask) -> Value * {
+    if (Mask.empty())
+      return nullptr;
+    Value *VecBase = nullptr;
     // If the resulting type is scalarized, do not adjust the cost.
     unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
     if (VecNumParts == VecTy->getNumElements())
-      return;
+      return nullptr;
     DenseMap<Value *, int> ExtractVectorsTys;
     SmallPtrSet<Value *, 4> CheckedExtracts;
-    for (auto *V : VL) {
-      if (isa<UndefValue>(V))
+    for (auto [I, V] : enumerate(VL)) {
+      if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == UndefMaskElem))
         continue;
       // If all users of instruction are going to be vectorized and this
       // instruction itself is not going to be vectorized, consider this
@@ -6644,6 +6756,7 @@
           (VE && VE != E))
         continue;
       auto *EE = cast<ExtractElementInst>(V);
+      VecBase = EE->getVectorOperand();
       std::optional<unsigned> EEIdx = getExtractIndex(EE);
       if (!EEIdx)
         continue;
@@ -6683,6 +6796,8 @@
       if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
         unsigned Idx = (Data.second / NumElts) * NumElts;
         unsigned EENumElts = EEVTy->getNumElements();
+        if (Idx % NumElts == 0)
+          continue;
         if (Idx + NumElts <= EENumElts) {
           Cost += TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
@@ -6702,12 +6817,16 @@
                                       VecTy, std::nullopt, CostKind, 0, EEVTy);
         }
       }
+    return VecBase;
   };
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
       return 0;
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
+    unsigned VF = E->getVectorFactor();
+    SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                          E->ReuseShuffleIndices.end());
     SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
     // Build a mask out of the reorder indices and reorder scalars per this
     // mask.
@@ -6716,24 +6835,51 @@
     if (!ReorderMask.empty())
       reorderScalars(GatheredScalars, ReorderMask);
     SmallVector<int> Mask;
+    SmallVector<int> ExtractMask;
+    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
     std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
     SmallVector<const TreeEntry *> Entries;
+    Type *ScalarTy = GatheredScalars.front()->getType();
+    // Check for gathered extracts.
+    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+
+    InstructionCost Cost = 0;
+    bool Resized = false;
+    if (Value *VecBase = AdjustExtractsCost(Cost, ExtractMask))
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+        }
+
     // Do not try to look for reshuffled loads for gathered loads (they will be
     // handled later), for vectorized scalars, and cases, which are definitely
     // not profitable (splats and small gather nodes.)
-    if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+        E->isAltShuffle() ||
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
         (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
       GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
     if (GatherShuffle) {
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      if (!Resized) {
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+      }
       // Remove shuffled elements from list of gathers.
       for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
         if (Mask[I] != UndefMaskElem)
           GatheredScalars[I] = PoisonValue::get(ScalarTy);
       }
-      assert((Entries.size() == 1 || Entries.size() == 2) &&
-             "Expected shuffle of 1 or 2 entries.");
       InstructionCost GatherCost = 0;
       int Limit = Mask.size() * 2;
       if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
@@ -6762,29 +6908,19 @@
       GatherCost += getGatherCost(GatheredScalars);
       return GatherCost;
     }
-    if ((E->getOpcode() == Instruction::ExtractElement ||
-         all_of(E->Scalars,
-                [](Value *V) {
-                  return isa<ExtractElementInst, UndefValue>(V);
-                })) &&
-        allSameType(VL)) {
+    if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
       // Check that gather of extractelements can be represented as just a
       // shuffle of a single/two vectors the scalars are extracted from.
-      SmallVector<int> Mask;
-      std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
-          isFixedVectorShuffle(VL, Mask);
-      if (ShuffleKind) {
-        // Found the bunch of extractelement instructions that must be gathered
-        // into a vector and can be represented as a permutation elements in a
-        // single input vector or of 2 input vectors.
-        InstructionCost Cost =
-            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
-        AdjustExtractsCost(Cost);
-        if (NeedToShuffleReuses)
-          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                      FinalVecTy, E->ReuseShuffleIndices);
-        return Cost;
-      }
+      // Found the bunch of extractelement instructions that must be gathered
+      // into a vector and can be represented as a permutation of elements in
+      // a single input vector or of 2 input vectors.
+      InstructionCost Cost =
+          computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
+      AdjustExtractsCost(Cost, ExtractMask);
+      if (NeedToShuffleReuses)
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    FinalVecTy, E->ReuseShuffleIndices);
+      return Cost;
     }
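Note: the ShuffleKind that tryToGatherExtractElements returns here (and that computeExtractCost consumes) classifies the shape of the extract mask. A minimal IR sketch of the three fixed-vector kinds, with hypothetical operands %a and %b; the kind names come from TargetTransformInfo:

define void @shuffle_kinds(<4 x i32> %a, <4 x i32> %b) {
  ; SK_PermuteSingleSrc: every lane comes from the one real input %a.
  %single = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; SK_Select: lanes keep their positions, each picked from %a or %b.
  %blend = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ; SK_PermuteTwoSrc: lanes are both mixed across inputs and repositioned.
  %perm2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 7, i32 3>
  ret void
}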
     if (isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as the
@@ -9192,6 +9328,56 @@
     }
     if (E->getMainOp())
       setInsertPointAfterBundle(E);
+    unsigned VF = E->getVectorFactor();
+    auto AdjustExtracts = [&](const TreeEntry *E, ArrayRef<int> Mask) {
+      Value *VecBase = nullptr;
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        int Idx = Mask[I];
+        if (Idx == UndefMaskElem)
+          continue;
+        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+        VecBase = EI->getVectorOperand();
+        // If all users are vectorized, we can delete the extractelement
+        // itself.
+        if (any_of(EI->users(),
+                   [&](User *U) { return !ScalarToTreeEntry.count(U); }))
+          continue;
+        eraseInstruction(EI);
+      }
+      return VecBase;
+    };
+    auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
+      if (V1->getType() != V2->getType()) {
+        unsigned VF1 = cast<FixedVectorType>(V1->getType())->getNumElements();
+        unsigned VF2 = cast<FixedVectorType>(V2->getType())->getNumElements();
+        unsigned VF = std::max(VF1, VF2);
+        SmallVector<int> ExtMask(VF, UndefMaskElem);
+        std::iota(ExtMask.begin(),
+                  std::next(ExtMask.begin(), std::min(VF1, VF2)), 0);
+        if (VF1 < VF2) {
+          V1 = Builder.CreateShuffleVector(V1, ExtMask);
+          if (auto *I = dyn_cast<Instruction>(V1)) {
+            GatherShuffleExtractSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+        } else {
+          V2 = Builder.CreateShuffleVector(V2, ExtMask);
+          if (auto *I = dyn_cast<Instruction>(V2)) {
+            GatherShuffleExtractSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+        }
+      }
+      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
+      if (auto *I = dyn_cast<Instruction>(Vec)) {
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
+      return Vec;
+    };
+
+    SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                          E->ReuseShuffleIndices.end());
     SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
     // Build a mask out of the reorder indices and reorder scalars per this
     // mask.
@@ -9199,19 +9385,94 @@
     inversePermutation(E->ReorderIndices, ReorderMask);
     if (!ReorderMask.empty())
       reorderScalars(GatheredScalars, ReorderMask);
-    Value *Vec;
+    Value *Vec = nullptr;
     SmallVector<int> Mask;
+    SmallVector<int> ExtractMask;
+    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
     SmallVector<const TreeEntry *> Entries;
-    std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
-        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
-    if (Shuffle) {
+    Type *ScalarTy = GatheredScalars.front()->getType();
+    // Check for gathered extracts.
+    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+    bool Resized = false;
+    if (Value *VecBase = AdjustExtracts(E, ExtractMask))
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+        }
+    // Gather extracts only after we check for fully matched gathers.
+    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+        E->isAltShuffle() ||
+        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+        isSplat(E->Scalars) ||
+        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+    if (GatherShuffle) {
       assert((Entries.size() == 1 || Entries.size() == 2) &&
              "Expected shuffle of 1 or 2 entries.");
-      Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
-                                        Entries.back()->VectorizedValue, Mask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
+      if (!Resized) {
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+      }
+      // Remove shuffled elements from list of gathers.
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        if (Mask[I] != UndefMaskElem)
+          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+      }
+    }
+    if ((ExtractShuffle || GatherShuffle) &&
+        all_of(GatheredScalars, PoisonValue::classof)) {
+      Value *Vec1 = nullptr;
+      if (ExtractShuffle) {
+        // Gather of extractelements can be represented as just a shuffle of
+        // a single/two vectors the scalars are extracted from.
+        // Find input vectors.
+        Value *Vec2 = nullptr;
+        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+          if (ExtractMask[I] == UndefMaskElem ||
+              (!Mask.empty() && Mask[I] != UndefMaskElem)) {
+            ExtractMask[I] = UndefMaskElem;
+            continue;
+          }
+          if (isa<UndefValue>(E->Scalars[I]))
+            continue;
+          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+          if (!Vec1) {
+            Vec1 = EI->getVectorOperand();
+          } else if (Vec1 != EI->getVectorOperand()) {
+            assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+                   "Expected only 1 or 2 vectors shuffle.");
+            Vec2 = EI->getVectorOperand();
+          }
+        }
+        if (Vec2)
+          Vec1 = CreateShuffle(Vec1, Vec2, ExtractMask);
+        else if (Vec1)
+          Vec1 = CreateShuffle(Vec1, Vec1, ExtractMask);
+      }
+      if (GatherShuffle) {
+        Vec = CreateShuffle(Entries.front()->VectorizedValue,
+                            Entries.back()->VectorizedValue, Mask);
+        if (Vec1) {
+          // Build final mask.
+          for (auto [I, Idx] : enumerate(Mask)) {
+            if (ExtractMask[I] != UndefMaskElem)
+              Idx = I;
+            else if (Idx != UndefMaskElem)
+              Idx = I + VF;
+          }
+          Vec = CreateShuffle(Vec1, Vec, Mask);
+        }
+      } else {
+        Vec = Vec1;
       }
     } else {
       Vec = gather(E->Scalars);
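Note: when the two inputs chosen above have different vector widths, the CreateShuffle helper first widens the narrower value with an identity mask and only then emits the requested two-source shuffle. A minimal IR sketch of the emitted pair of instructions, under the assumption VF1 = 2 < VF2 = 4 and with hypothetical values %v1 and %v2:

define <4 x i32> @widen_then_shuffle(<2 x i32> %v1, <4 x i32> %v2) {
  ; std::iota fills the identity prefix of the mask; the tail lanes stay poison.
  %v1.ext = shufflevector <2 x i32> %v1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  ; The operand types now match, so the requested mask can be applied directly.
  %vec = shufflevector <4 x i32> %v1.ext, <4 x i32> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %vec
}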
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -110,18 +110,15 @@
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -141,18 +138,15 @@
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -110,18 +110,15 @@
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -141,18 +138,15 @@
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1
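Note: the <2 x i32> masks in the two @k tests above follow from the scalar pairing visible in the deleted checks: lane 0 of the vector add rebuilds x0*x0 + x3*x3 and lane 1 rebuilds x1*x1 + x2*x2, so the first shuffle takes lanes {0, 1} of the squared vector and the second takes lanes {3, 2}. A condensed sketch (the actual output keeps two identical mul instructions and leaves them to later CSE; this version assumes a single one):

define i8 @k_condensed(<4 x i8> %x) {
  %sq = mul <4 x i8> %x, %x
  %lo = shufflevector <4 x i8> %sq, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x i8> %sq, <4 x i8> poison, <2 x i32> <i32 3, i32 2>
  ; %sum = <x0*x0 + x3*x3, x1*x1 + x2*x2>
  %sum = add <2 x i8> %lo, %hi
  %n = extractelement <2 x i8> %sum, i64 0
  %d = extractelement <2 x i8> %sum, i64 1
  %r = sdiv i8 %n, %d
  ret i8 %r
}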
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -18,21 +18,21 @@
 ; SSE-LABEL: @splat(
 ; SSE-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
 ; SSE-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
-; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; SSE-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
-; SSE-NEXT:    store <16 x i8> [[TMP4]], ptr @cle, align 16
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
+; SSE-NEXT:    store <16 x i8> [[TMP6]], ptr @cle, align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @splat(
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
 ; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32>
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; AVX-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
-; AVX-NEXT:    store <16 x i8> [[TMP4]], ptr @cle, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32>
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
+; AVX-NEXT:    store <16 x i8> [[TMP6]], ptr @cle, align 16
 ; AVX-NEXT:    ret void
 ;
   %1 = xor i8 %c, %a
@@ -91,15 +91,15 @@
 ;
 ; AVX-LABEL: @same_opcode_on_one_side(
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
-; AVX-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2
-; AVX-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32>
-; AVX-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[SHUFFLE2]]
-; AVX-NEXT:    store <4 x i32> [[TMP6]], ptr @cle32, align 16
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32>
+; AVX-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]]
+; AVX-NEXT:    store <4 x i32> [[TMP9]], ptr @cle32, align 16
 ; AVX-NEXT:    ret void
 ;
   %add1 = add i32 %c, %a
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -146,16 +146,14 @@
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 0, i32 1>
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 2, i32 3>
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
+; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -180,16 +180,14 @@
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 0, i32 1>
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 2, i32 3>
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
+; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -18,41 +18,11 @@
 define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; SSE2-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; SSE2-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; SSE2-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; SSE2-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; SSE2-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; SSE2-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0
-; SSE2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3
-; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4
-; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5
-; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6
-; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7
-; SSE2-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; SSE2-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; SSE2-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; SSE2-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; SSE2-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; SSE2-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1
-; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2
-; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3
-; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4
-; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5
-; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6
-; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7
-; SSE2-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]])
-; SSE2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]])
-; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]]
+; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -70,41 +40,11 @@
 ;
 ; AVX-LABEL: @reduce_and4(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; AVX-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; AVX-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; AVX-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; AVX-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; AVX-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7
-; AVX-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; AVX-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; AVX-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; AVX-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; AVX-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; AVX-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7
-; AVX-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]])
-; AVX-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]]
+; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
 ; AVX-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -154,41 +94,11 @@
 define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4_transpose(
-; SSE2-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; SSE2-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; SSE2-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; SSE2-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; SSE2-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; SSE2-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; SSE2-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; SSE2-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; SSE2-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; SSE2-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2
-; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3
-; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4
-; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5
-; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6
-; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7
-; SSE2-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; SSE2-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0
-; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1
-; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2
-; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3
-; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4
-; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5
-; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6
-; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7
-; SSE2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]])
-; SSE2-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -204,41 +114,11 @@
 ; SSE42-NEXT:    ret i32 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
-; AVX-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; AVX-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; AVX-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; AVX-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; AVX-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; AVX-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; AVX-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; AVX-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; AVX-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; AVX-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7
-; AVX-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; AVX-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7
-; AVX-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]])
-; AVX-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; AVX-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
 ; AVX-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT:    ret i32 [[OP_RDX1]]
 ;
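Note: the <8 x i32> masks in reduction-transpose.ll are forced by the lane order of the deleted insertelement chains. For @reduce_and4 the old gather order was v2[1], v2[0], v2[2], v2[3], v1[1], v1[0], v1[2], v1[3]; with %v2 as the first shuffle operand (mask values 0-3) and %v1 as the second (values 4-7), that is exactly <1, 0, 2, 3, 5, 4, 6, 7>. A condensed sketch of one half of the new pattern (hypothetical function name):

define i32 @reduce_half(i32 %acc, <4 x i32> %v1, <4 x i32> %v2) {
  ; One two-source shuffle replaces 8 extracts and 8 inserts.
  %t = shufflevector <4 x i32> %v2, <4 x i32> %v1, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
  %r = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %t)
  %r.acc = and i32 %r, %acc
  ret i32 %r.acc
}
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)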
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
@@ -18,7 +18,7 @@
 ; YAML-NEXT:  Function:        fextr
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-20'
+; YAML-NEXT:    - Cost:            '-19'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '4'