diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8663,6 +8663,11 @@ return TE->isOperandGatherNode({E, NodeIdx}) && VE->isSame(TE->Scalars); }))) { + auto FinalShuffle = [&](Value *V, ArrayRef Mask) { + ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleBuilder.add(V, Mask); + return ShuffleBuilder.finalize(std::nullopt); + }; Value *V = vectorizeTree(VE); if (VF != cast(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { @@ -8696,18 +8701,14 @@ assert(VF >= UsedIdxs.size() && "Expected vectorization factor " "less than original vector size."); UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); - V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); + V = FinalShuffle(V, UniqueIdxs); } else { assert(VF < cast(V->getType())->getNumElements() && "Expected vectorization factor less " "than original vector size."); SmallVector UniformMask(VF, 0); std::iota(UniformMask.begin(), UniformMask.end(), 0); - V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); - } - if (auto *I = dyn_cast(V)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); + V = FinalShuffle(V, UniformMask); } } return V; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -5,15 +5,15 @@ ; CHECK-LABEL: @wombat( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], undef -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> undef, <4 x i32> [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] -; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP27]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]] +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP27]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -58,12 +58,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[SHRINK_SHUFFLE:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], ptr undef, align 2 -; CHECK-NEXT: [[SHRINK_SHUFFLE]] = shufflevector <8 x i16> [[SHUFFLE]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr undef, align 2 ; CHECK-NEXT: br label [[FOR_BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll @@ -9,23 +9,22 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ADD7:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[TMP0]], -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: switch i32 undef, label [[SW_EPILOG:%.*]] [ ; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP1]], ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[THIS:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: sw.epilog: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> undef, [[SHUFFLE]] -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SHUFFLE1]] -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> undef, [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: