Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -959,5 +959,33 @@ return LT.first * Entry->Cost; } + if (Kind == TTI::SK_Select) { + static const CostTblEntry SelectTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + }; + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = + CostTableLookup(SelectTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + + if (Kind == TTI::SK_PermuteSingleSrc) { + static const CostTblEntry PermuteTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + }; + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = + CostTableLookup(PermuteTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } Index: test/Analysis/CostModel/AArch64/shuffle-select.ll =================================================================== --- test/Analysis/CostModel/AArch64/shuffle-select.ll +++ test/Analysis/CostModel/AArch64/shuffle-select.ll @@ -39,7 +39,7 @@ } ; COST-LABEL: sel.v2i32 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ; CODE-LABEL: sel.v2i32 ; CODE: mov v0.s[1], v1.s[1] define <2 x i32> @sel.v2i32(<2 x i32> %v0, <2 x i32> %v1) { @@ -48,7 +48,7 @@ } ; COST-LABEL: sel.v4i32 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; COST: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CODE-LABEL: sel.v4i32 ; CODE: rev64 v0.4s, v0.4s ; CODE: trn2 v0.4s, v0.4s, v1.4s @@ -58,7 +58,7 @@ } ; COST-LABEL: sel.v2i64 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ; CODE-LABEL: sel.v2i64 ; CODE: mov v0.d[1], v1.d[1] define <2 x i64> @sel.v2i64(<2 x i64> %v0, <2 x i64> %v1) { @@ -67,7 +67,7 @@ } ; COST-LABEL: sel.v2f32 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ; CODE-LABEL: sel.v2f32 ; CODE: mov v0.s[1], v1.s[1] define <2 x float> @sel.v2f32(<2 x float> %v0, <2 x float> %v1) { @@ -76,7 +76,7 @@ } ; COST-LABEL: sel.v4f32 -; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; COST: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CODE-LABEL: sel.v4f32 ; CODE: rev64 v0.4s, v0.4s ; CODE: trn2 v0.4s, v0.4s, v1.4s @@ -86,7 +86,7 @@ } ; COST-LABEL: sel.v2f64 -; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ; CODE-LABEL: sel.v2f64 ; CODE: mov v0.d[1], v1.d[1] define <2 x double> @sel.v2f64(<2 x double> %v0, <2 x double> %v1) { Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -76,31 +76,22 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1 -; CHECK-NEXT: [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2 -; CHECK-NEXT: [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1 -; CHECK-NEXT: [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2 -; CHECK-NEXT: [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]] -; CHECK-NEXT: [[TMP0_3:%.*]] = add i32 [[V0_3]], [[V1_3]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]] -; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]] -; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]] -; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 -; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2 -; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] +; CHECK-NEXT: ret <4 x i32> [[TMP11]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -131,20 +122,18 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 -; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_0]], i32 2 -; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0