diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5894,15 +5894,29 @@
       std::iota(Mask.begin(), Mask.end(), 0);
       return TargetTransformInfo::SK_PermuteSingleSrc;
     }
-    // No perfect match, just shuffle, so choose the first tree node.
-    Entries.push_back(*UsedTEs.front().begin());
+    // No perfect match, just shuffle, so choose the tree node with max index.
+    Entries.push_back(
+        *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
+                          [](const TreeEntry *TE1, const TreeEntry *TE2) {
+                            return TE1->Idx < TE2->Idx;
+                          }));
+    VF = Entries.back()->getVectorFactor();
   } else {
     // Try to find nodes with the same vector factor.
     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
     DenseMap<unsigned, const TreeEntry *> VFToTE;
-    for (const TreeEntry *TE : UsedTEs.front())
-      VFToTE.try_emplace(TE->getVectorFactor(), TE);
-    for (const TreeEntry *TE : UsedTEs.back()) {
+    for (const TreeEntry *TE : UsedTEs.front()) {
+      auto Res = VFToTE.try_emplace(TE->getVectorFactor(), TE);
+      if (!Res.second && TE->Idx > Res.first->second->Idx)
+        Res.first->getSecond() = TE;
+    }
+    SmallVector<const TreeEntry *> UsedTEsBack(UsedTEs.back().begin(),
+                                               UsedTEs.back().end());
+    // Sort the elements of UsedTEsBack by the idx to keep the determinism.
+    sort(UsedTEsBack, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+      return TE1->Idx > TE2->Idx;
+    });
+    for (const TreeEntry *TE : UsedTEsBack) {
       auto It = VFToTE.find(TE->getVectorFactor());
       if (It != VFToTE.end()) {
         VF = It->first;
@@ -5911,10 +5925,22 @@
         break;
       }
     }
-    // No 2 source vectors with the same vector factor - give up and do regular
-    // gather.
-    if (Entries.empty())
-      return None;
+    // No 2 source vectors with the same vector factor - just choose 2 with max
+    // index.
+    if (Entries.empty()) {
+      Entries.push_back(
+          *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
+                            [](const TreeEntry *TE1, const TreeEntry *TE2) {
+                              return TE1->Idx < TE2->Idx;
+                            }));
+      Entries.push_back(
+          *std::max_element(UsedTEs.back().begin(), UsedTEs.back().end(),
+                            [](const TreeEntry *TE1, const TreeEntry *TE2) {
+                              return TE1->Idx < TE2->Idx;
+                            }));
+      VF = std::max(Entries.front()->getVectorFactor(),
+                    Entries.back()->getVectorFactor());
+    }
   }
 
   // Build a shuffle mask for better cost estimation and vector emission.
@@ -6322,8 +6348,51 @@
     if (Shuffle.hasValue()) {
       assert((Entries.size() == 1 || Entries.size() == 2) &&
              "Expected shuffle of 1 or 2 entries.");
-      Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
-                                        Entries.back()->VectorizedValue, Mask);
+      Value *Vec1 = Entries.front()->VectorizedValue;
+      Value *Vec2 = Entries.back()->VectorizedValue;
+      // Resize smaller vector if the types are not equal.
+      if (Vec1->getType() != Vec2->getType()) {
+        auto *FixedTy1 = dyn_cast<FixedVectorType>(Vec1->getType());
+        auto *FixedTy2 = dyn_cast<FixedVectorType>(Vec2->getType());
+        if (!FixedTy1 || !FixedTy2) {
+          // Not a fixed vector - resize both vectors to the size of the
+          // resulting vector.
+          SmallVector<int> ResizeMask(Mask.size(), UndefMaskElem);
+          std::iota(ResizeMask.begin(), ResizeMask.end(), 0);
+          Vec1 = Builder.CreateShuffleVector(Vec1, ResizeMask);
+          if (auto *I = dyn_cast<Instruction>(Vec1)) {
+            GatherShuffleSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+          Vec2 = Builder.CreateShuffleVector(Vec2, ResizeMask);
+          if (auto *I = dyn_cast<Instruction>(Vec2)) {
+            GatherShuffleSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+        } else {
+          // Resize smallest vector to the size of the greatest one.
+          unsigned VF1 = FixedTy1->getNumElements();
+          unsigned VF2 = FixedTy2->getNumElements();
+          SmallVector<int> ResizeMask(std::max(VF1, VF2), UndefMaskElem);
+          std::iota(ResizeMask.begin(),
+                    std::next(ResizeMask.begin(), std::min(VF1, VF2)), 0);
+          if (cast<FixedVectorType>(Vec1->getType())->getNumElements() <
+              cast<FixedVectorType>(Vec2->getType())->getNumElements()) {
+            Vec1 = Builder.CreateShuffleVector(Vec1, ResizeMask);
+            if (auto *I = dyn_cast<Instruction>(Vec1)) {
+              GatherShuffleSeq.insert(I);
+              CSEBlocks.insert(I->getParent());
+            }
+          } else {
+            Vec2 = Builder.CreateShuffleVector(Vec2, ResizeMask);
+            if (auto *I = dyn_cast<Instruction>(Vec2)) {
+              GatherShuffleSeq.insert(I);
+              CSEBlocks.insert(I->getParent());
+            }
+          }
+        }
+      }
+      Vec = Builder.CreateShuffleVector(Vec1, Vec2, Mask);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
         GatherShuffleSeq.insert(I);
         CSEBlocks.insert(I->getParent());
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffled-gathers-diff-size.ll
@@ -4,37 +4,36 @@
 define void @foo(i32* noalias nocapture writeonly %B, i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %C, i32 %n, i32 %m) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[N:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
-; CHECK-NEXT:    [[MUL2:%.*]] = mul nsw i32 [[TMP1]], [[M:%.*]]
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[MUL2]], [[MUL]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i32 [[ADD]], [[TMP2]]
-; CHECK-NEXT:    store i32 [[MUL4]], i32* [[B:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], [[M]]
-; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[MUL9]], [[MUL]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[MUL12:%.*]] = mul nsw i32 [[ADD10]], [[TMP4]]
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
-; CHECK-NEXT:    store i32 [[MUL12]], i32* [[ARRAYIDX13]], align 4
-; CHECK-NEXT:    [[MUL15:%.*]] = mul nsw i32 [[TMP2]], [[N]]
-; CHECK-NEXT:    [[MUL17:%.*]] = mul nsw i32 [[TMP4]], [[M]]
-; CHECK-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL15]]
-; CHECK-NEXT:    [[MUL20:%.*]] = mul nsw i32 [[ADD18]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[N:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[N]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX24]], align 4
-; CHECK-NEXT:    [[MUL25:%.*]] = mul nsw i32 [[TMP5]], [[M]]
-; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL25]], [[MUL15]]
-; CHECK-NEXT:    [[MUL28:%.*]] = mul nsw i32 [[ADD26]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[M:%.*]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP6]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP17]]
 ; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    store i32 [[MUL28]], i32* [[ARRAYIDX29]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: