diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1623,10 +1623,16 @@
   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                     bool &NeedToScalarize) const;

+  /// Split \p TotalTripCount into main and remainder loop trip counts for \p VF.
+  std::pair<uint64_t, uint64_t>
+  calculateMainEpilogTripCount(const VectorizationFactor &VF,
+                               unsigned TotalTripCount) const;
+
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
   /// that of B.
   bool isMoreProfitable(const VectorizationFactor &A,
-                        const VectorizationFactor &B) const;
+                        const VectorizationFactor &B,
+                        const bool IsMainLoop) const;

   /// Invalidates decisions already taken by the cost model.
   void invalidateCostModelingDecisions() {
@@ -6036,25 +6042,59 @@
   return MaxVF;
 }

-bool LoopVectorizationCostModel::isMoreProfitable(
-    const VectorizationFactor &A, const VectorizationFactor &B) const {
-  InstructionCost CostA = A.Cost;
-  InstructionCost CostB = B.Cost;
+std::pair<uint64_t, uint64_t>
+LoopVectorizationCostModel::calculateMainEpilogTripCount(
+    const VectorizationFactor &VF, unsigned TotalTripCount) const {
+  uint64_t MainTripCount = TotalTripCount / VF.Width.getKnownMinValue();
+  uint64_t EpilogTripCount = TotalTripCount % VF.Width.getKnownMinValue();
+
+  // In "foldTailByMasking" mode, remainder iterations are executed as part of
+  // the main vector loop. That means all remainder iterations will be executed
+  // as one masked vector iteration.
+  if (EpilogTripCount != 0 && foldTailByMasking()) {
+    ++MainTripCount;
+    EpilogTripCount = 0;
+  }
+
+  // In "requiresScalarEpilogue" mode, there should be at least one iteration
+  // executed in the epilogue loop. If all iterations end up being executed
+  // as part of the main vector loop, forward one vector iteration to the
+  // remainder loop.
+  if (EpilogTripCount == 0 && requiresScalarEpilogue(VF.Width)) {
+    assert(MainTripCount > 0 && "Trip count is expected to be > 0");
+    --MainTripCount;
+    EpilogTripCount = VF.Width.getKnownMinValue();
+  }
+  return {MainTripCount, EpilogTripCount};
+}
+
+bool LoopVectorizationCostModel::isMoreProfitable(const VectorizationFactor &A,
+                                                  const VectorizationFactor &B,
+                                                  const bool IsMainLoop) const {
+  unsigned MainTripCountA = 0;
+  unsigned EpilogueTripCountA = 0;
+  unsigned MainTripCountB = 0;
+  unsigned EpilogueTripCountB = 0;
+  APInt FixedCostDiff{128, 0, true};
+  APInt MainCostDiff{128, 0, true};
+  APInt EpilogueCostDiff{128, 0, true};
+
+  // Invalid cost is less profitable than any other cost (valid or invalid).
+  if (!A.Cost.isValid())
+    return false;

-  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
+  // Valid cost (A.Cost is valid at this point) is more profitable than any
+  // invalid cost.
+  if (!B.Cost.isValid()) {
+    return true;
+  }

-  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
-      MaxTripCount) {
-    // If we are folding the tail and the trip count is a known (possibly small)
-    // constant, the trip count will be rounded up to an integer number of
-    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
-    // which we compare directly. When not folding the tail, the total cost will
-    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
-    // approximated with the per-lane cost below instead of using the tripcount
-    // as here.
-    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
-    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
-    return RTCostA < RTCostB;
+  if (IsMainLoop) {
+    InstructionCost RTChecksCost = 0;
+    InstructionCost FixedCostA = A.Width.isScalar() ? 0 : RTChecksCost;
+    InstructionCost FixedCostB = B.Width.isScalar() ? 0 : RTChecksCost;
+    FixedCostDiff = {128, (uint64_t)(*(FixedCostB - FixedCostA).getValue()),
+                     true};
   }

   // When set to preferred, for now assume vscale may be larger than 1, so
@@ -6062,14 +6102,37 @@
   // vectorization.
   if (Hints->isScalableVectorizationPreferred())
     if (A.Width.isScalable() && !B.Width.isScalable())
-      return (CostA * B.Width.getKnownMinValue()) <=
-             (CostB * A.Width.getKnownMinValue());
-
-  // To avoid the need for FP division:
-  //      (CostA / A.Width) < (CostB / B.Width)
-  // <=>  (CostA * B.Width) < (CostB * A.Width)
-  return (CostA * B.Width.getKnownMinValue()) <
-         (CostB * A.Width.getKnownMinValue());
+      FixedCostDiff += 1;
+
+  auto MaybeTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+  unsigned ExpectedTC =
+      MaybeTC ? *MaybeTC : std::numeric_limits<unsigned>::max();
+  std::tie(MainTripCountA, EpilogueTripCountA) =
+      calculateMainEpilogTripCount(A, ExpectedTC);
+  std::tie(MainTripCountB, EpilogueTripCountB) =
+      calculateMainEpilogTripCount(B, ExpectedTC);
+
+  MainCostDiff =
+      APInt{128, (uint64_t)(*B.Cost.getValue()), true} * MainTripCountB -
+      APInt{128, (uint64_t)(*A.Cost.getValue()), true} * MainTripCountA;
+  EpilogueCostDiff =
+      APInt{128, (uint64_t)(*getScalarVF().Cost.getValue()), true} *
+      APInt{128, (uint64_t)((int64_t)EpilogueTripCountB -
+                            (int64_t)EpilogueTripCountA),
+            true};
+
+  auto ResDiff = FixedCostDiff + MainCostDiff + EpilogueCostDiff;
+  bool IsProfitable = ResDiff.sgt(0);
+
+  LLVM_DEBUG(dbgs() << "LV: Vectorization of the "
+                    << (IsMainLoop ? "main" : "epilogue") << " loop of width "
+                    << A.Width.getKnownMinValue()
+                    << (IsProfitable ? " is more" : " is less")
+                    << " profitable than of width "
+                    << B.Width.getKnownMinValue()
+                    << " by: " << (int64_t)ResDiff.getLimitedValue(64) << "\n");
+
+  return IsProfitable;
 }

 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6112,7 +6175,7 @@
     // during epilogue vectorization cost modeling.
    CachedVFs.push_back(Candidate);

-    if (isMoreProfitable(Candidate, ChosenFactor))
+    if (isMoreProfitable(Candidate, ChosenFactor, true))
      ChosenFactor = Candidate;
  }

@@ -6309,7 +6372,7 @@
    if (NextVF.Width.isScalar())
      continue;
    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
-        isMoreProfitable(NextVF, Result) &&
+        isMoreProfitable(NextVF, Result, false) &&
        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
      Result = NextVF;
  }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -27,23 +27,23 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 20
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -55,7 +55,7 @@
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -112,7 +112,7 @@
 ; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -128,7 +128,7 @@
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -183,7 +183,7 @@
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -200,7 +200,7 @@
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
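The trade-off the new cost comparison encodes, and which the updated vect.omp.force.small-tc.ll checks reflect (VF 4 instead of VF 8 for a trip count of 20), can be sketched outside of LLVM as follows. This is a minimal standalone illustration, not code from the patch: plain uint64_t stands in for InstructionCost/APInt, the tail-folding and required-scalar-epilogue cases are omitted, and ScalarIterCost/FixedVectorCost are made-up values rather than numbers taken from the cost model.

// Standalone sketch (C++17): total cost = fixed vector overhead
//                            + per-iteration vector cost * main trip count
//                            + per-iteration scalar cost * epilogue trip count.
#include <cstdint>
#include <iostream>
#include <utility>

struct Factor {
  uint64_t Width; // vectorization factor (lanes per vector iteration)
  uint64_t Cost;  // cost of one vector iteration at this width (assumed value)
};

// Split a known total trip count into main-loop and epilogue iterations,
// mirroring the simple case of calculateMainEpilogTripCount().
static std::pair<uint64_t, uint64_t> splitTripCount(uint64_t TC, uint64_t VF) {
  return {TC / VF, TC % VF};
}

// Fixed overhead (e.g. runtime checks) is charged only to vector factors.
static uint64_t totalCost(const Factor &F, uint64_t TC, uint64_t ScalarIterCost,
                          uint64_t FixedVectorCost) {
  auto [MainTC, EpilogTC] = splitTripCount(TC, F.Width);
  uint64_t Fixed = F.Width == 1 ? 0 : FixedVectorCost;
  return Fixed + F.Cost * MainTC + ScalarIterCost * EpilogTC;
}

int main() {
  const uint64_t TC = 20;             // total trip count, as in the test above
  const uint64_t ScalarIterCost = 4;  // assumed per-iteration scalar cost
  const uint64_t FixedVectorCost = 6; // assumed runtime-check overhead
  Factor VF8{8, 10}, VF4{4, 6};       // assumed per-iteration vector costs

  uint64_t C8 = totalCost(VF8, TC, ScalarIterCost, FixedVectorCost);
  uint64_t C4 = totalCost(VF4, TC, ScalarIterCost, FixedVectorCost);
  std::cout << "VF8 total cost: " << C8 << "\nVF4 total cost: " << C4 << "\n"
            << (C4 < C8 ? "VF4" : "VF8") << " is more profitable\n";
  return 0;
}

With these illustrative numbers VF 4 wins (36 vs. 42) because 20 % 4 == 0 leaves no scalar remainder, while VF 8 pays for 4 scalar epilogue iterations; a purely per-lane comparison would have preferred VF 8.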