diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3589,13 +3589,24 @@
       for (const auto &Data : ExtractVectorsTys) {
         auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
         unsigned NumElts = VecTy->getNumElements();
-        if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy))
-          Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                        EEVTy, None,
-                                        (Data.second / NumElts) * NumElts, VecTy);
-        else
+        if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
+          unsigned Idx = (Data.second / NumElts) * NumElts;
+          unsigned EENumElts = EEVTy->getNumElements();
+          if (Idx + NumElts <= EENumElts) {
+            Cost +=
+                TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                      EEVTy, None, Idx, VecTy);
+          } else {
+            auto *SubVT =
+                FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
+            Cost +=
+                TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                      EEVTy, None, Idx, SubVT);
+          }
+        } else {
           Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
                                         VecTy, None, 0, EEVTy);
+        }
       }
     };
     if (E->State == TreeEntry::NeedToGather) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -223,52 +223,25 @@
   ret <2 x i32> %ins.1
 }
 
-define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
-; GFX7-LABEL: @uadd_sat_v3i16(
+define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v9i16_combine_vi16(
 ; GFX7-NEXT:  bb:
-; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0
-; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1
-; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
-; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0
-; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1
-; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
-; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
-; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 8
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
-; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
-; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
-; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
-;
-; GFX8-LABEL: @uadd_sat_v3i16(
-; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP3]], i64 0
-; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1
-; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <2 x i16> <i16 -1, i16 undef>, i16 [[ADD_2]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_2]]
 ;
 bb:
-  %arg0.0 = extractelement <3 x i16> %arg0, i64 0
-  %arg0.1 = extractelement <3 x i16> %arg0, i64 1
-  %arg0.2 = extractelement <3 x i16> %arg0, i64 2
-  %arg1.0 = extractelement <3 x i16> %arg1, i64 0
-  %arg1.1 = extractelement <3 x i16> %arg1, i64 1
-  %arg1.2 = extractelement <3 x i16> %arg1, i64 2
-  %add.0 = call i16 @llvm.uadd.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %arg0.1 = extractelement <9 x i16> undef, i64 7
+  %arg0.2 = extractelement <9 x i16> %arg0, i64 8
+  %arg1.1 = extractelement <9 x i16> %arg1, i64 7
+  %arg1.2 = extractelement <9 x i16> %arg1, i64 8
   %add.1 = call i16 @llvm.uadd.sat.i16(i16 %arg0.1, i16 %arg1.1)
   %add.2 = call i16 @llvm.uadd.sat.i16(i16 %arg0.2, i16 %arg1.2)
-  %ins.0 = insertelement <3 x i16> undef, i16 %add.0, i64 0
-  %ins.1 = insertelement <3 x i16> %ins.0, i16 %add.1, i64 1
-  %ins.2 = insertelement <3 x i16> %ins.1, i16 %add.2, i64 2
-  ret <3 x i16> %ins.2
+  %ins.1 = insertelement <2 x i16> undef, i16 %add.1, i64 0
+  %ins.2 = insertelement <2 x i16> %ins.1, i16 %add.2, i64 1
+  ret <2 x i16> %ins.2
 }
 
 define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {