diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3613,28 +3613,37 @@ TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask); } for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast(VL[I]); - // If all users are going to be vectorized, instruction can be - // considered as dead. - // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(EI)) { - // Take credit for instruction that will become dead. - if (EI->hasOneUse()) { - Instruction *Ext = EI->user_back(); - if ((isa(Ext) || isa(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. + auto *EI = cast(VL[I]); + // Take credit for instruction that will become dead. + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); + if ((isa(Ext) || isa(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(EI); + CommonCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(), + *getExtractIndex(EE)); + } else { CommonCost -= TTI->getExtractWithExtendCost( Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; } + // Add back the cost of s|zext which is subtracted separately. + CommonCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; } + } + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(EI); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); + } else { CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -145,22 +145,21 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]] -; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0 ; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1 -; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2 -; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3 +; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -145,22 +145,21 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]] -; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0 ; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1 -; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2 -; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3 +; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll @@ -46,13 +46,13 @@ ; CHECK-NEXT: ret float [[X0]] ; ; THRESH1-LABEL: @f_used_out_of_tree( -; THRESH1-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; THRESH1-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1 -; THRESH1-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X0]] -; THRESH1-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]] -; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]] +; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 +; THRESH1-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]] +; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; THRESH1-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]] ; THRESH1-NEXT: store float [[ADD]], float* @a, align 4 -; THRESH1-NEXT: ret float [[X0]] +; THRESH1-NEXT: ret float [[TMP1]] ; ; THRESH2-LABEL: @f_used_out_of_tree( ; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0