diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -772,6 +772,31 @@ Scalars[Mask[I]] = Prev[I]; } +/// \returns shuffle mask if \p Insert is not the last of \p Inserts +static Optional> +getMaskIfIntermediateInsert(Value *Insert, ArrayRef Inserts) { + auto *LastInsert = cast(*find_if(Inserts, [Inserts](Value *V) { + return all_of(Inserts, [V](Value *VV) { + return V != cast(VV)->getOperand(0); + }); + })); + if (Insert != LastInsert) { + const unsigned NumElts = + cast(Insert->getType())->getNumElements(); + SmallVector Mask(NumElts); + std::iota(Mask.begin(), Mask.end(), 0); + + Value *CurrentInsert = LastInsert; + while (CurrentInsert != Insert) { + unsigned InsertIdx = *getInsertIndex(CurrentInsert); + Mask[InsertIdx] = InsertIdx + NumElts; + CurrentInsert = cast(CurrentInsert)->getOperand(0); + } + return Mask; + } + return None; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -5907,9 +5932,18 @@ if (EphValues.count(EU.User)) continue; - // No extract cost for vector "scalar" - if (isa(EU.Scalar->getType())) + // Add extract cost for vector "scalar", if it's not the last insert + if (isa(EU.Scalar->getType())) { + TreeEntry *E = getTreeEntry(EU.Scalar); + assert(E && "Invalid scalar"); + if (auto Mask = getMaskIfIntermediateInsert(EU.Scalar, E->Scalars)) { + InstructionCost C = TTI->getShuffleCost( + TTI::SK_PermuteTwoSrc, cast(EU.Scalar->getType()), + *Mask); + ExtractCost += C; + } continue; + } // Already counted the cost for external uses when tried to adjust the cost // for extractelements, no need to add it again. @@ -7240,6 +7274,18 @@ assert(isa(Scalar->getType()) && isa(Scalar) && "In-tree scalar of vector type is not insertelement?"); + // For insertelements, if Scalar is not the last insert, + // then shuffle vectorized value with original vector + if (auto Mask = getMaskIfIntermediateInsert(Scalar, E->Scalars)) { + auto *FirstInsert = + cast(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, + cast(V)->getOperand(0)); + })); + Vec = + Builder.CreateShuffleVector(Vec, FirstInsert->getOperand(0), *Mask); + } + return Vec; }; // If User == nullptr, the Scalar is used as extra arg. Generate diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr52275.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr52275.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr52275.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr52275.ll @@ -4,12 +4,12 @@ define <4 x i8> @test(<4 x i8> %v, i8* %x) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[X]] to <2 x i8>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <4 x i32> -; CHECK-NEXT: [[V11:%.*]] = shufflevector <4 x i8> [[V:%.*]], <4 x i8> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[V2:%.*]] = add <4 x i8> [[V11]], [[V11]] +; CHECK-NEXT: [[X0:%.*]] = load i8, i8* [[X:%.*]], align 4 +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i8, i8* [[X]], i64 1 +; CHECK-NEXT: [[X1:%.*]] = load i8, i8* [[G1]], align 4 +; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i8> [[V:%.*]], i8 [[X0]], i64 0 +; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i64 1 +; CHECK-NEXT: [[V2:%.*]] = add <4 x i8> [[V0]], [[V1]] ; CHECK-NEXT: ret <4 x i8> [[V2]] ; ; FORCE_SLP-LABEL: @test( @@ -18,7 +18,8 @@ ; FORCE_SLP-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 4 ; FORCE_SLP-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <4 x i32> ; FORCE_SLP-NEXT: [[V11:%.*]] = shufflevector <4 x i8> [[V:%.*]], <4 x i8> [[TMP3]], <4 x i32> -; FORCE_SLP-NEXT: [[V2:%.*]] = add <4 x i8> [[V11]], [[V11]] +; FORCE_SLP-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[V11]], <4 x i8> [[V]], <4 x i32> +; FORCE_SLP-NEXT: [[V2:%.*]] = add <4 x i8> [[TMP4]], [[V11]] ; FORCE_SLP-NEXT: ret <4 x i8> [[V2]] ; %x0 = load i8, i8* %x, align 4 @@ -36,7 +37,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[T1]] to <2 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> -; CHECK-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> [[T6:%.*]], <2 x i32> +; CHECK-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP3]], [[TMP4]] ; CHECK-NEXT: ret <2 x i8> [[T11]] ; ; FORCE_SLP-LABEL: @test2( @@ -44,7 +46,8 @@ ; FORCE_SLP-NEXT: [[TMP1:%.*]] = bitcast i32* [[T1]] to <2 x i32>* ; FORCE_SLP-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; FORCE_SLP-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> -; FORCE_SLP-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP3]], [[TMP3]] +; FORCE_SLP-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> [[T6:%.*]], <2 x i32> +; FORCE_SLP-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP3]], [[TMP4]] ; FORCE_SLP-NEXT: ret <2 x i8> [[T11]] ; %t3 = load i32, i32* %t1, align 4 @@ -60,12 +63,14 @@ define <2 x i8> @test_reorder(<2 x i8> %t6, i32* %t1) { ; CHECK-LABEL: @test_reorder( -; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[T1]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[T1:%.*]], align 4 +; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 1 +; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[T4]], align 4 +; CHECK-NEXT: [[T7:%.*]] = trunc i32 [[T3]] to i8 +; CHECK-NEXT: [[T8:%.*]] = insertelement <2 x i8> [[T6:%.*]], i8 [[T7]], i64 1 +; CHECK-NEXT: [[T9:%.*]] = trunc i32 [[T5]] to i8 +; CHECK-NEXT: [[T10:%.*]] = insertelement <2 x i8> [[T8]], i8 [[T9]], i64 0 +; CHECK-NEXT: [[T11:%.*]] = add <2 x i8> [[T10]], [[T8]] ; CHECK-NEXT: ret <2 x i8> [[T11]] ; ; FORCE_SLP-LABEL: @test_reorder( @@ -74,7 +79,8 @@ ; FORCE_SLP-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; FORCE_SLP-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> ; FORCE_SLP-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> -; FORCE_SLP-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP4]], [[TMP4]] +; FORCE_SLP-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> [[T6:%.*]], <2 x i32> +; FORCE_SLP-NEXT: [[T11:%.*]] = add <2 x i8> [[TMP4]], [[TMP5]] ; FORCE_SLP-NEXT: ret <2 x i8> [[T11]] ; %t3 = load i32, i32* %t1, align 4 @@ -96,7 +102,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> ; CHECK-NEXT: [[T101:%.*]] = shufflevector <4 x i8> [[T6:%.*]], <4 x i8> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[T11:%.*]] = add <4 x i8> [[T101]], [[T101]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[T101]], <4 x i8> [[T6]], <4 x i32> +; CHECK-NEXT: [[T11:%.*]] = add <4 x i8> [[T101]], [[TMP5]] ; CHECK-NEXT: ret <4 x i8> [[T11]] ; ; FORCE_SLP-LABEL: @test_subvector( @@ -106,7 +113,8 @@ ; FORCE_SLP-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> ; FORCE_SLP-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> ; FORCE_SLP-NEXT: [[T101:%.*]] = shufflevector <4 x i8> [[T6:%.*]], <4 x i8> [[TMP4]], <4 x i32> -; FORCE_SLP-NEXT: [[T11:%.*]] = add <4 x i8> [[T101]], [[T101]] +; FORCE_SLP-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[T101]], <4 x i8> [[T6]], <4 x i32> +; FORCE_SLP-NEXT: [[T11:%.*]] = add <4 x i8> [[T101]], [[TMP5]] ; FORCE_SLP-NEXT: ret <4 x i8> [[T11]] ; %t3 = load i32, i32* %t1, align 4 @@ -122,13 +130,14 @@ define <4 x i8> @test_subvector_reorder(<4 x i8> %t6, i32* %t1) { ; CHECK-LABEL: @test_subvector_reorder( -; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[T1]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> -; CHECK-NEXT: [[T81:%.*]] = shufflevector <4 x i8> [[T6:%.*]], <4 x i8> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[T11:%.*]] = add <4 x i8> [[T81]], [[T81]] +; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[T1:%.*]], align 4 +; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 1 +; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[T4]], align 4 +; CHECK-NEXT: [[T7:%.*]] = trunc i32 [[T3]] to i8 +; CHECK-NEXT: [[T8:%.*]] = insertelement <4 x i8> [[T6:%.*]], i8 [[T7]], i64 3 +; CHECK-NEXT: [[T9:%.*]] = trunc i32 [[T5]] to i8 +; CHECK-NEXT: [[T10:%.*]] = insertelement <4 x i8> [[T8]], i8 [[T9]], i64 2 +; CHECK-NEXT: [[T11:%.*]] = add <4 x i8> [[T10]], [[T8]] ; CHECK-NEXT: ret <4 x i8> [[T11]] ; ; FORCE_SLP-LABEL: @test_subvector_reorder( @@ -138,7 +147,8 @@ ; FORCE_SLP-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> ; FORCE_SLP-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> ; FORCE_SLP-NEXT: [[T81:%.*]] = shufflevector <4 x i8> [[T6:%.*]], <4 x i8> [[TMP4]], <4 x i32> -; FORCE_SLP-NEXT: [[T11:%.*]] = add <4 x i8> [[T81]], [[T81]] +; FORCE_SLP-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[T81]], <4 x i8> [[T6]], <4 x i32> +; FORCE_SLP-NEXT: [[T11:%.*]] = add <4 x i8> [[T81]], [[TMP5]] ; FORCE_SLP-NEXT: ret <4 x i8> [[T11]] ; %t3 = load i32, i32* %t1, align 4