diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8663,6 +8663,11 @@ return TE->isOperandGatherNode({E, NodeIdx}) && VE->isSame(TE->Scalars); }))) { + auto FinalShuffle = [&](Value *V, ArrayRef Mask) { + ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); + ShuffleBuilder.add(V, Mask); + return ShuffleBuilder.finalize(std::nullopt); + }; Value *V = vectorizeTree(VE); if (VF != cast(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { @@ -8696,18 +8701,14 @@ assert(VF >= UsedIdxs.size() && "Expected vectorization factor " "less than original vector size."); UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); - V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); + V = FinalShuffle(V, UniqueIdxs); } else { assert(VF < cast(V->getType())->getNumElements() && "Expected vectorization factor less " "than original vector size."); SmallVector UniformMask(VF, 0); std::iota(UniformMask.begin(), UniformMask.end(), 0); - V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); - } - if (auto *I = dyn_cast(V)) { - GatherShuffleExtractSeq.insert(I); - CSEBlocks.insert(I->getParent()); + V = FinalShuffle(V, UniformMask); } } return V; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -8,15 +8,15 @@ ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], undef -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> undef, <4 x i32> [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP2]], undef +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> undef, <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -62,12 +62,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[SHRINK_SHUFFLE:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* undef, align 2 -; CHECK-NEXT: [[SHRINK_SHUFFLE]] = shufflevector <8 x i16> [[SHUFFLE]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: store <8 x i16> [[TMP1]], <8 x i16>* undef, align 2 ; CHECK-NEXT: br label [[FOR_BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll @@ -11,25 +11,24 @@ ; CHECK-NEXT: [[G:%.*]] = getelementptr inbounds [[CLASS_E:%.*]], %class.e* [[THIS:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ADD7:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[TMP0]], -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: switch i32 undef, label [[SW_EPILOG:%.*]] [ ; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[G]] to <2 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP5]], [[TMP3]] ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: sw.epilog: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ] -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> undef, [[SHUFFLE]] -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP6]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> undef, [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4 ; CHECK-NEXT: ret void ; entry: