diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4401,6 +4401,11 @@ if (isa(EU.Scalar->getType())) continue; + // Already counted the cost for external uses when tried to adjust the cost + // for extractelements, no need to add it again. + if (isa(EU.Scalar)) + continue; + // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. if (EU.User && isa(EU.User)) { @@ -5466,7 +5471,14 @@ Value *Lane = Builder.getInt32(ExternalUse.Lane); auto ExtractAndExtendIfNeeded = [&](Value *Vec) { if (Scalar->getType() != Vec->getType()) { - Value *Ex = Builder.CreateExtractElement(Vec, Lane); + Value *Ex; + // "Reuse" the existing extract to improve final codegen. + if (auto *ES = dyn_cast(Scalar)) { + Ex = Builder.CreateExtractElement(ES->getOperand(0), + ES->getOperand(1)); + } else { + Ex = Builder.CreateExtractElement(Vec, Lane); + } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. if (!MinBWs.count(ScalarRoot)) @@ -5474,12 +5486,11 @@ if (MinBWs[ScalarRoot].second) return Builder.CreateSExt(Ex, Scalar->getType()); return Builder.CreateZExt(Ex, Scalar->getType()); - } else { - assert(isa(Scalar->getType()) && - isa(Scalar) && - "In-tree scalar of vector type is not insertelement?"); - return Vec; } + assert(isa(Scalar->getType()) && + isa(Scalar) && + "In-tree scalar of vector type is not insertelement?"); + return Vec; }; // If User == nullptr, the Scalar is used as extra arg. Generate // ExtractElement instruction and update the record for this scalar in diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0 ; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 @@ -40,7 +40,7 @@ ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0 ; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 @@ -87,7 +87,7 @@ ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -6,16 +6,17 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1 +; CHECK-NEXT: ret <2 x i64> [[TMP3_1]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -114,16 +115,12 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -145,21 +142,14 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP2_11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2_32:%.*]] = shufflevector <4 x i32> [[TMP2_11]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i32> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP2_32:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_32]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -183,25 +173,23 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -6,16 +6,17 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1 +; CHECK-NEXT: ret <2 x i64> [[TMP3_1]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -114,16 +115,12 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -145,21 +142,14 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP2_11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2_32:%.*]] = shufflevector <4 x i32> [[TMP2_11]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i32> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP2_32:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_32]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -183,25 +173,23 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -143,9 +143,9 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 ; CHECK-NEXT: call void @use(double [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP4]]) ; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -74,10 +74,10 @@ define float @f_used_twice_in_tree(<2 x float> %x) { ; CHECK-LABEL: @f_used_twice_in_tree( -; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1 -; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X1]] -; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i32 0 +; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]] ; CHECK-NEXT: ret float [[ADD]] ;