diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3543,6 +3543,59 @@
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                             E->ReuseShuffleIndices);
   }
+  auto &&AdjustExtractsCost = [this, CostKind, VL, VecTy](InstructionCost &Cost,
+                                                          bool IsGather) {
+    DenseMap<Value *, int> ExtractVectorsTys;
+    for (auto *V : VL) {
+      // If all users of instruction are going to be vectorized and this
+      // instruction itself is not going to be vectorized, consider this
+      // instruction as dead and remove its cost from the final cost of the
+      // vectorized tree.
+      if (IsGather && (!areAllUsersVectorized(cast<Instruction>(V)) ||
+                       ScalarToTreeEntry.count(V)))
+        continue;
+      auto *EE = cast<ExtractElementInst>(V);
+      unsigned Idx = *getExtractIndex(EE);
+      if (TTI->getNumberOfParts(VecTy) !=
+          TTI->getNumberOfParts(EE->getVectorOperandType())) {
+        auto It =
+            ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
+        It->getSecond() = std::min<int>(It->second, Idx);
+      }
+      // Take credit for instruction that will become dead.
+      if (EE->hasOneUse()) {
+        Instruction *Ext = EE->user_back();
+        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+            all_of(Ext->users(),
+                   [](User *U) { return isa<GetElementPtrInst>(U); })) {
+          // Use getExtractWithExtendCost() to calculate the cost of
+          // extractelement/ext pair.
+          Cost -=
+              TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+                                            EE->getVectorOperandType(), Idx);
+          // Add back the cost of s|zext which is subtracted separately.
+          Cost += TTI->getCastInstrCost(
+              Ext->getOpcode(), Ext->getType(), EE->getType(),
+              TTI::getCastContextHint(Ext), CostKind, Ext);
+          continue;
+        }
+      }
+      Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                      EE->getVectorOperandType(), Idx);
+    }
+    // Add a cost for subvector extracts/inserts if required.
+    for (const auto &Data : ExtractVectorsTys) {
+      auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
+      unsigned NumElts = VecTy->getNumElements();
+      if (TTI->getNumberOfParts(EEVTy) > TTI->getNumberOfParts(VecTy))
+        Cost +=
+            TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, EEVTy,
+                                None, (Data.second / NumElts) * NumElts, VecTy);
+      else
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+                                    VecTy, None, 0, EEVTy);
+    }
+  };
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
       return 0;
@@ -3559,19 +3612,7 @@
       if (ShuffleKind.hasValue()) {
         InstructionCost Cost =
             computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
-        for (auto *V : VL) {
-          // If all users of instruction are going to be vectorized and this
-          // instruction itself is not going to be vectorized, consider this
-          // instruction as dead and remove its cost from the final cost of the
-          // vectorized tree.
-          if (areAllUsersVectorized(cast<Instruction>(V)) &&
-              !ScalarToTreeEntry.count(V)) {
-            auto *IO = cast<ConstantInt>(
-                cast<ExtractElementInst>(V)->getIndexOperand());
-            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                            IO->getZExtValue());
-          }
-        }
+        AdjustExtractsCost(Cost, /*IsGather=*/true);
         return ReuseShuffleCost + Cost;
       }
     }
@@ -3617,11 +3658,10 @@
       unsigned Idx = 0;
       for (unsigned I : E->ReuseShuffleIndices) {
         if (ShuffleOrOp == Instruction::ExtractElement) {
-          auto *IO = cast<ConstantInt>(
-              cast<ExtractElementInst>(VL[I])->getIndexOperand());
-          Idx = IO->getZExtValue();
+          auto *EE = cast<ExtractElementInst>(VL[I]);
           ReuseShuffleCost -= TTI->getVectorInstrCost(
-              Instruction::ExtractElement, VecTy, Idx);
+              Instruction::ExtractElement, EE->getVectorOperandType(),
+              *getExtractIndex(EE));
         } else {
           ReuseShuffleCost -= TTI->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Idx);
@@ -3631,14 +3671,15 @@
       Idx = ReuseShuffleNumbers;
       for (Value *V : VL) {
         if (ShuffleOrOp == Instruction::ExtractElement) {
-          auto *IO = cast<ConstantInt>(
-              cast<ExtractElementInst>(V)->getIndexOperand());
-          Idx = IO->getZExtValue();
+          auto *EE = cast<ExtractElementInst>(V);
+          ReuseShuffleCost += TTI->getVectorInstrCost(
+              Instruction::ExtractElement, EE->getVectorOperandType(),
+              *getExtractIndex(EE));
         } else {
           --Idx;
+          ReuseShuffleCost += TTI->getVectorInstrCost(
+              Instruction::ExtractElement, VecTy, Idx);
         }
-        ReuseShuffleCost +=
-            TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
       }
       CommonCost = ReuseShuffleCost;
     } else if (!E->ReorderIndices.empty()) {
@@ -3647,12 +3688,9 @@
       CommonCost = TTI->getShuffleCost(
           TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
     }
-    for (unsigned I = 0, E = VL.size(); I < E; ++I) {
-      Instruction *EI = cast<Instruction>(VL[I]);
-      // If all users are going to be vectorized, instruction can be
-      // considered as dead.
-      // The same, if have only one user, it will be vectorized for sure.
-      if (areAllUsersVectorized(EI)) {
+    if (ShuffleOrOp == Instruction::ExtractValue) {
+      for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+        auto *EI = cast<Instruction>(VL[I]);
         // Take credit for instruction that will become dead.
         if (EI->hasOneUse()) {
           Instruction *Ext = EI->user_back();
@@ -3673,6 +3711,8 @@
         CommonCost -=
            TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
       }
+    } else {
+      AdjustExtractsCost(CommonCost, /*IsGather=*/false);
     }
     return CommonCost;
   }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -145,22 +145,21 @@
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32>
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT:    [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
 ;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -145,22 +145,21 @@
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32>
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT:    [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
 ;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
@@ -12,20 +12,17 @@
 ; CHECK-LABEL: @noop_extracts_first_2_lanes(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
+; CHECK-NEXT:    [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
+; CHECK-NEXT:    [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
 ; CHECK-NEXT:    [[V2_LANE_3:%.*]] = extractelement <4 x double> [[V_2]], i32 3
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_3]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[V_1]], i32 0
-; CHECK-NEXT:    call void @use(double [[TMP5]])
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[V_1]], i32 1
-; CHECK-NEXT:    call void @use(double [[TMP6]])
+; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
+; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_3]]
+; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[A_LANE_0]], i32 0
+; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
+; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
+; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    store <2 x double> [[A_INS_1]], <2 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -230,22 +227,19 @@
 ; CHECK-NEXT:    [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_1]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[V2_LANE_0]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 2
-; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 3
-; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[A_LANE_2:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
+; CHECK-NEXT:    [[A_LANE_3:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP6]], i32 1
+; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
+; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_2]])
@@ -294,17 +288,12 @@
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
 ; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
+; CHECK-NEXT:    [[A_LANE_2:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
 ; CHECK-NEXT:    [[A_LANE_3:%.*]] = fmul double [[V1_LANE_3]], [[V2_LANE_0]]
 ; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[TMP6]], i32 2
+; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
+; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
 ; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -110,22 +110,22 @@
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
 ; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
 ; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32>
+; AVX1-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32>
+; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32>
+; AVX1-NEXT:    [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
 ; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32>
+; AVX1-NEXT:    [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX1-NEXT:    [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32>
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32(
@@ -425,10 +425,10 @@
 ; CHECK-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; CHECK-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; CHECK-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
 ; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; CHECK-NEXT:    ret <8 x i32> [[R7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -110,22 +110,22 @@
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
 ; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
 ; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32>
+; AVX1-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32>
+; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32>
+; AVX1-NEXT:    [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
 ; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32>
+; AVX1-NEXT:    [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32>
+; AVX1-NEXT:    [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32>
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
@@ -46,13 +46,13 @@
 ; CHECK-NEXT:    ret float [[X0]]
 ;
 ; THRESH1-LABEL: @f_used_out_of_tree(
-; THRESH1-NEXT:    [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
-; THRESH1-NEXT:    [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
-; THRESH1-NEXT:    [[X0X0:%.*]] = fmul float [[X0]], [[X0]]
-; THRESH1-NEXT:    [[X1X1:%.*]] = fmul float [[X1]], [[X1]]
-; THRESH1-NEXT:    [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]]
+; THRESH1-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; THRESH1-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
+; THRESH1-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH1-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH1-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
 ; THRESH1-NEXT:    store float [[ADD]], float* @a, align 4
-; THRESH1-NEXT:    ret float [[X0]]
+; THRESH1-NEXT:    ret float [[TMP1]]
 ;
 ; THRESH2-LABEL: @f_used_out_of_tree(
 ; THRESH2-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
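
Note (not part of the patch): the bookkeeping introduced by the new AdjustExtractsCost lambda can be summarized, under simplified assumptions, by the standalone C++ sketch below. The types Extract and adjustExtractsCost and the unit costs are hypothetical stand-ins; the real code queries TargetTransformInfo for per-instruction and per-shuffle costs rather than counting units.

// Illustrative sketch only. Models the idea: credit extracts that become
// dead once their users are vectorized, and charge one subvector shuffle
// per source vector whose register width differs from the destination.
#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

struct Extract {
  int SourceVector;  // id of the vector the element is read from
  unsigned Index;    // extracted lane
  bool BecomesDead;  // all users vectorized, so the extract will be removed
};

int adjustExtractsCost(int Cost, const std::vector<Extract> &VL,
                       unsigned DstNumParts,
                       const std::map<int, unsigned> &SrcNumParts) {
  std::map<int, unsigned> SubvectorSources; // source id -> smallest used lane
  for (const Extract &E : VL) {
    if (!E.BecomesDead)
      continue;
    // Remember sources that do not match the destination register count.
    if (SrcNumParts.at(E.SourceVector) != DstNumParts) {
      auto It = SubvectorSources.try_emplace(E.SourceVector, E.Index).first;
      It->second = std::min(It->second, E.Index);
    }
    --Cost; // credit for the extractelement that becomes dead
  }
  // One extra shuffle (extract/insert subvector) per mismatched source.
  Cost += static_cast<int>(SubvectorSources.size());
  return Cost;
}

int main() {
  // Two dead extracts from a wider source vector (id 0).
  std::vector<Extract> VL = {{0, 2, true}, {0, 3, true}};
  std::map<int, unsigned> SrcParts = {{0, 2}}; // source spans 2 registers
  std::printf("adjusted cost = %d\n", adjustExtractsCost(4, VL, 1, SrcParts));
  return 0;
}

The design point mirrored here is the one the lambda implements: extracts whose users are all vectorized are treated as dead and their scalar cost is credited back, while extracts from vectors wider or narrower than the destination additionally pay for one subvector extract or insert per distinct source vector.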