diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2255,7 +2255,9 @@ TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1) { - return getVectorInstrCostHelper(nullptr, Val, Index, false /* HasRealUse */); + return getVectorInstrCostHelper(nullptr, Val, Index, + Opcode == Instruction::InsertElement && Op0 && + !isa(Op0)); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2459,14 +2459,6 @@ /// for ease of later optimization. Value *createBuildVector(const TreeEntry *E); - /// \returns the scalarization cost for this type. Scalarization in this - /// context means the creation of vectors from a group of scalars. If \p - /// NeedToShuffle is true, need to add a cost of reshuffling some of the - /// vector elements. - InstructionCost getGatherCost(FixedVectorType *Ty, - const APInt &ShuffledIndices, - bool NeedToShuffle) const; - /// Returns the instruction in the bundle, which can be used as a base point /// for scheduling. Usually it is the last instruction in the bundle, except /// for the case when all operands are external (in this case, it is the first @@ -2488,7 +2480,8 @@ /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - InstructionCost getGatherCost(ArrayRef VL) const; + /// \param ForPoisonSrc true if initial vector is poison, false otherwise. + InstructionCost getGatherCost(ArrayRef VL, bool ForPoisonSrc) const; /// Set the Builder insert point to one after the last instruction in /// the bundle @@ -6922,9 +6915,10 @@ /*SubTp=*/nullptr, /*Args=*/*It) : TTI::TCC_Free); } - return GatherCost + (all_of(Gathers, UndefValue::classof) - ? TTI::TCC_Free - : R.getGatherCost(Gathers)); + return GatherCost + + (all_of(Gathers, UndefValue::classof) + ? TTI::TCC_Free + : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; public: @@ -7176,23 +7170,22 @@ GatheredScalars.front()->getType(), GatheredScalars.size()))); return Estimator.finalize(E->ReuseShuffleIndices); } - if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) { + InstructionCost Cost = 0; + if (ExtractShuffle) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - InstructionCost Cost = - computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); - return Cost + Estimator.finalize(E->ReuseShuffleIndices); + Cost += computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); } Estimator.gather( GatheredScalars, - (ExtractShuffle || GatherShuffle) - ? Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), GatheredScalars.size())) - : nullptr); - return Estimator.finalize(E->ReuseShuffleIndices); + VL.equals(GatheredScalars) + ? nullptr + : Constant::getNullValue(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size()))); + return Cost + Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -8791,19 +8784,8 @@ return std::nullopt; } -InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, - const APInt &ShuffledIndices, - bool NeedToShuffle) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost Cost = - TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true, - /*Extract*/ false, CostKind); - if (NeedToShuffle) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); - return Cost; -} - -InstructionCost BoUpSLP::getGatherCost(ArrayRef VL) const { +InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, + bool ForPoisonSrc) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) @@ -8815,20 +8797,36 @@ // shuffle candidates. APInt ShuffledElements = APInt::getZero(VL.size()); DenseSet UniqueElements; - // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { - unsigned Idx = I - 1; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost Cost; + auto EstimateInsertCost = [&](unsigned I, Value *V) { + if (!ForPoisonSrc) + Cost += + TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, + I, Constant::getNullValue(VecTy), V); + }; + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Value *V = VL[I]; // No need to shuffle duplicates for constants. - if (isConstant(VL[Idx])) { - ShuffledElements.setBit(Idx); + if ((ForPoisonSrc && isConstant(V)) || isa(V)) { + ShuffledElements.setBit(I); continue; } - if (!UniqueElements.insert(VL[Idx]).second) { + if (!UniqueElements.insert(V).second) { DuplicateNonConst = true; - ShuffledElements.setBit(Idx); + ShuffledElements.setBit(I); + continue; } + EstimateInsertCost(I, V); } - return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); + if (ForPoisonSrc) + Cost = + TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true, + /*Extract*/ false, CostKind); + if (DuplicateNonConst) + Cost += + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + return Cost; } // Perform operand reordering on the instructions in VL and return the reordered diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -3,21 +3,27 @@ define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> , <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; CHECK-NEXT: br label [[TMP11:%.*]] -; CHECK: 11: -; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]] -; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> -; CHECK-NEXT: br label [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; CHECK-NEXT: br label [[TMP17:%.*]] +; CHECK: 17: +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ] +; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0 +; CHECK-NEXT: br label [[TMP17]] ; %4 = extractelement <2 x i64> %1, i64 0 %5 = or i64 %4, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll @@ -6,21 +6,18 @@ ; CHECK-LABEL: define i64 @fshl ; CHECK-SAME: (i64 [[OR1:%.*]], i64 [[OR2:%.*]], i64 [[OR3:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[OR2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[OR3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <2 x i64> ) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> , i64 [[OR1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> ) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> , <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i64> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 -; CHECK-NEXT: [[ADD3:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[XOR5:%.*]] = xor i64 [[ADD3]], [[TMP12]] +; CHECK-NEXT: [[OR4:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR2]], i64 0, i64 1) +; CHECK-NEXT: [[XOR1:%.*]] = xor i64 [[OR4]], 0 +; CHECK-NEXT: [[OR5:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR3]], i64 0, i64 2) +; CHECK-NEXT: [[XOR2:%.*]] = xor i64 [[OR5]], [[OR1]] +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[XOR1]], [[OR1]] +; CHECK-NEXT: [[ADD2:%.*]] = add i64 0, [[XOR2]] +; CHECK-NEXT: [[OR6:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR1]], i64 [[OR2]], i64 17) +; CHECK-NEXT: [[XOR3:%.*]] = xor i64 [[OR6]], [[ADD1]] +; CHECK-NEXT: [[OR7:%.*]] = tail call i64 @llvm.fshl.i64(i64 0, i64 0, i64 21) +; CHECK-NEXT: [[XOR4:%.*]] = xor i64 [[OR7]], [[ADD2]] +; CHECK-NEXT: [[ADD3:%.*]] = or i64 [[XOR3]], [[ADD2]] +; CHECK-NEXT: [[XOR5:%.*]] = xor i64 [[ADD3]], [[XOR4]] ; CHECK-NEXT: ret i64 [[XOR5]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -95,33 +95,30 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { ; SSE-LABEL: @logical_and_icmp_diff_preds( ; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 ; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]] -; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 -; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false -; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 -; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false -; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false +; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false ; SSE-NEXT: ret i1 [[S3]] ; ; AVX-LABEL: @logical_and_icmp_diff_preds( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> , <4 x i32> -; AVX-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 -; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false -; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 -; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false -; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 -; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0 +; AVX-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false ; AVX-NEXT: ret i1 [[S3]] ; %x0 = extractelement <4 x i32> %x, i32 0