diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -554,8 +554,11 @@
   SmallVector<int> UndefVectorExtracts;
   for (int I = 0, E = VL.size(); I < E; ++I) {
     auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
-    if (!EI)
+    if (!EI) {
+      if (isa<UndefValue>(VL[I]))
+        UndefVectorExtracts.push_back(I);
       continue;
+    }
     auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
     if (!VecTy || !isa<ConstantInt>(EI->getIndexOperand()))
       continue;
@@ -1198,6 +1201,8 @@
   /// Gets reordering data for the given tree entry. If the entry is vectorized
   /// - just return ReorderIndices, otherwise check if the scalars can be
   /// reordered and return the most optimal order.
+  /// \return std::nullopt if ordering is not important, empty order, if
+  /// identity order is important, or the actual order.
   /// \param TopToBottom If true, include the order of vectorized stores and
   /// insertelement nodes, otherwise skip them.
   std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
@@ -1444,8 +1449,14 @@
     ConstantInt *Ex1Idx;
     if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
       // Undefs are always profitable for extractelements.
+      // Compiler can easily combine poison and extractelement <non-poison> or
+      // undef and extractelement <poison>. But combining undef +
+      // extractelement <non-poison-but-may-produce-poison> requires some
+      // extra operations.
       if (isa<UndefValue>(V2))
-        return LookAheadHeuristics::ScoreConsecutiveExtracts;
+        return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
+                   ? LookAheadHeuristics::ScoreConsecutiveExtracts
+                   : LookAheadHeuristics::ScoreSameOpcode;
       Value *EV2 = nullptr;
       ConstantInt *Ex2Idx = nullptr;
       if (match(V2,
@@ -4169,6 +4180,41 @@
       return std::move(CurrentOrder);
     }
   }
+  // If the gather node is <undef, v, .., poison> and
+  // insertelement poison, v, 0 [+ permute]
+  // is cheaper than
+  // insertelement poison, v, n - try to reorder.
+  // If rotating the whole graph, exclude the permute cost, the whole graph
+  // might be transformed.
+  int Sz = TE.Scalars.size();
+  if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
+      count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
+    const auto *It =
+        find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
+    if (It == TE.Scalars.begin())
+      return OrdersType();
+    auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
+    if (It != TE.Scalars.end()) {
+      OrdersType Order(Sz, Sz);
+      unsigned Idx = std::distance(TE.Scalars.begin(), It);
+      Order[Idx] = 0;
+      fixupOrderingIndices(Order);
+      SmallVector<int> Mask;
+      inversePermutation(Order, Mask);
+      InstructionCost PermuteCost =
+          TopToBottom
+              ? 0
+              : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
+      InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
+          Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
+          PoisonValue::get(Ty), *It);
+      InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
+          Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
+          PoisonValue::get(Ty), *It);
+      if (InsertFirstCost + PermuteCost < InsertIdxCost)
+        return std::move(Order);
+    }
+  }
   if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
     return CurrentOrder;
   if (TE.Scalars.size() >= 4)
@@ -6939,9 +6985,10 @@
     // Add broadcast for non-identity shuffle only.
     bool NeedShuffle =
        VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
-    InstructionCost InsertCost =
-        TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
-                                /*Index=*/0, PoisonValue::get(VecTy), *It);
+    InstructionCost InsertCost = TTI->getVectorInstrCost(
+        Instruction::InsertElement, VecTy, CostKind,
+        NeedShuffle ? 0 : std::distance(VL.begin(), It),
+        PoisonValue::get(VecTy), *It);
     return InsertCost + (NeedShuffle ?
TTI->getShuffleCost( TargetTransformInfo::SK_Broadcast, VecTy, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,7 +10,7 @@ ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[SHUFFLE:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP10:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP5:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] @@ -27,27 +27,26 @@ ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[SHUFFLE1:%.*]], [[BB10]] ], [ [[TMP11:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[SHUFFLE]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[BB10]] ], [ [[TMP12:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> 
poison, <4 x i32> ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup -; CHECK-NEXT: [[SHUFFLE1]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP11]] = phi <2 x i32> [ [[TMP6]], [[BB7]] ] +; CHECK-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP6]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll @@ -14,7 +14,7 @@ ; CHECK-LABEL: @Rf_GReset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @d, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> , [[TMP1]] ; CHECK-NEXT: br i1 icmp eq (ptr inttoptr (i64 115 to ptr), ptr @Rf_gpptr), label [[IF_THEN:%.*]], label [[IF_END7:%.*]] ; CHECK: if.then: @@ -22,7 +22,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], undef ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN6:%.*]], label [[IF_END7]] ; CHECK: if.then6: ; CHECK-NEXT: br label [[IF_END7]] @@ -55,7 +55,7 @@ ; CHECK-LABEL: @Rf_GReset_unary_fneg( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr 
@d, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x double> [[TMP1]] ; CHECK-NEXT: br i1 icmp eq (ptr inttoptr (i64 115 to ptr), ptr @Rf_gpptr), label [[IF_THEN:%.*]], label [[IF_END7:%.*]] ; CHECK: if.then: @@ -63,7 +63,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], undef ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN6:%.*]], label [[IF_END7]] ; CHECK: if.then6: ; CHECK-NEXT: br label [[IF_END7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -62,19 +62,20 @@ ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 ; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; SSE-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0 -; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP4]], <2 x i32> -; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], -; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> 
[[TMP8]], <2 x i64> poison, <2 x i32> -; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP4]], -; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] -; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer +; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP5]], <2 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP5]], +; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <2 x i64> [[TMP12]], ptr [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -18,7 +18,7 @@ ; YAML-NEXT: Function: fextr ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-19' +; YAML-NEXT: - Cost: '-20' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll @@ -13,21 +13,21 @@ define double @root_selection(double %a, double %b, double %c, double %d) 
local_unnamed_addr #0 { ; CHECK-LABEL: @root_selection( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 ; CHECK-NEXT: [[I09:%.*]] = fmul fast double [[TMP4]], undef ; CHECK-NEXT: [[I10:%.*]] = fsub fast double undef, [[I09]] -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[I10]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[I10]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = fdiv fast <2 x double> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 ; CHECK-NEXT: [[I07:%.*]] = fadd fast double undef, [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 ; CHECK-NEXT: [[I16:%.*]] = fadd fast double [[I07]], [[TMP12]] ; CHECK-NEXT: [[I17:%.*]] = fadd fast double [[I16]], [[C:%.*]] ; CHECK-NEXT: 
[[I18:%.*]] = fadd fast double [[I17]], [[D:%.*]]