diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -181,7 +181,10 @@
   // Vector width with best cost
   ElementCount Width;
   // Cost of the loop with that width
-  unsigned Cost;
+  InstructionCost Cost;
+
+  VectorizationFactor(ElementCount Width, InstructionCost Cost)
+      : Width(Width), Cost(Cost) {}
 
   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() {
@@ -195,6 +198,17 @@
   bool operator!=(const VectorizationFactor &rhs) const {
     return !(*this == rhs);
   }
+
+  // Returns true if this VectorizationFactor is more profitable than \p rhs.
+  bool isMoreProfitableThan(const VectorizationFactor &rhs) const {
+    InstructionCost::CostType ThisCost = *Cost.getValue();
+    InstructionCost::CostType RhsCost = *rhs.Cost.getValue();
+    // To avoid the need for FP division:
+    //      (CostA / A.Width) < (CostB / B.Width)
+    // <=>  (CostA * B.Width) < (CostB * A.Width)
+    return (ThisCost * rhs.Width.getKnownMinValue()) <
+           (RhsCost * Width.getKnownMinValue());
+  }
 };
 
 /// Planner drives the vectorization process after having passed
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5887,16 +5887,15 @@
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
 
-  auto Width = ElementCount::getFixed(1);
-  const float ScalarCost = *ExpectedCost.getValue();
-  float Cost = ScalarCost;
+  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
+  VectorizationFactor ChosenFactor = ScalarCost;
 
   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization && MaxVF.isVector()) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
-    Cost = std::numeric_limits<float>::max();
+    ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
   }
 
   for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
@@ -5905,10 +5904,14 @@
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
     VectorizationCostTy C = expectedCost(i);
+    assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
 
-    float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
-    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
-                      << " costs: " << (int)VectorCost << ".\n");
+    VectorizationFactor Candidate(i, C.first);
+    LLVM_DEBUG(
+        dbgs() << "LV: Vector loop of width " << i << " costs: "
+               << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue())
+               << ".\n");
+
     if (!C.second && !ForceVectorization) {
       LLVM_DEBUG(
           dbgs() << "LV: Not considering vector loop of width " << i
@@ -5917,32 +5920,27 @@
     }
 
     // If profitable add it to ProfitableVF list.
-    if (VectorCost < ScalarCost) {
-      ProfitableVFs.push_back(VectorizationFactor(
-          {i, (unsigned)VectorCost}));
-    }
+    if (Candidate.isMoreProfitableThan(ScalarCost))
+      ProfitableVFs.push_back(Candidate);
 
-    if (VectorCost < Cost) {
-      Cost = VectorCost;
-      Width = i;
-    }
+    if (Candidate.isMoreProfitableThan(ChosenFactor))
+      ChosenFactor = Candidate;
   }
 
   if (!EnableCondStoresVectorization && NumPredStores) {
     reportVectorizationFailure("There are conditional stores.",
         "store that is conditionally executed prevents vectorization",
         "ConditionalStore", ORE, TheLoop);
-    Width = ElementCount::getFixed(1);
-    Cost = ScalarCost;
+    ChosenFactor = ScalarCost;
   }
 
-  LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
+  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
+                 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
+                 dbgs()
              << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
-  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
-  VectorizationFactor Factor = {Width,
-                                (unsigned)(Width.getKnownMinValue() * Cost)};
-  return Factor;
+  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
+  return ChosenFactor;
 }
 
 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
@@ -6055,7 +6053,8 @@
 
   for (auto &NextVF : ProfitableVFs)
     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
-        (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
+        (Result.Width.getFixedValue() == 1 ||
+         NextVF.isMoreProfitableThan(Result)) &&
        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
       Result = NextVF;
 
@@ -9752,7 +9751,7 @@
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
   }
 
   // Identify the diagnostic messages that should be produced.
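
Note on the profitability comparison above: a minimal standalone sketch of the cross-multiplication trick that isMoreProfitableThan uses to compare per-lane cost without floating-point division (plain C++, not LLVM code; the VF struct and the sample costs below are hypothetical):

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for VectorizationFactor: Width lanes at Cost per
// vector-loop iteration.
struct VF {
  uint64_t Width;
  int64_t Cost;
};

//      (A.Cost / A.Width) < (B.Cost / B.Width)
// <=>  (A.Cost * B.Width) < (B.Cost * A.Width)
// Cross-multiplying preserves the ordering because both widths are positive.
static bool isMoreProfitableThan(const VF &A, const VF &B) {
  return A.Cost * static_cast<int64_t>(B.Width) <
         B.Cost * static_cast<int64_t>(A.Width);
}

int main() {
  VF Scalar{1, 8}; // scalar loop: cost 8 per element
  VF Vec4{4, 20};  // VF=4: 20 per iteration -> 5 per element, profitable
  VF Vec8{8, 72};  // VF=8: 72 per iteration -> 9 per element, not profitable
  assert(isMoreProfitableThan(Vec4, Scalar));  // 20 * 1 < 8 * 4
  assert(!isMoreProfitableThan(Vec8, Scalar)); // 72 * 1 >= 8 * 8
  return 0;
}

This also suggests why the change is more than cosmetic: the old code truncated the per-lane cost via (unsigned)VectorCost, whereas the integer cross-multiplied form compares exactly.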
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -10,6 +10,8 @@
 ; CHECK-LABEL: f:
 ; CHECK: vmovdqu64 %zmm{{.}},
 ; CHECK-NOT: %ymm
+; CHECK: epilog
+; CHECK: %ymm
 
 ; Verify that we don't generate 512-bit wide vectors when subtarget feature says not to
 
@@ -82,10 +84,14 @@
 ; CHECK-LABEL: h:
 ; CHECK: vmovdqu64 %zmm{{.}},
 ; CHECK-NOT: %ymm
+; CHECK: epilog
+; CHECK: %ymm
 
 ; CHECK-PREFER-AVX256-LABEL: h:
 ; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}},
 ; CHECK-PREFER-AVX256-NOT: %ymm
+; CHECK-PREFER-AVX256: epilog
+; CHECK-PREFER-AVX256: %ymm
 
 define void @h(i32* %a, i32 %n) "prefer-vector-width"="512" {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -168,7 +168,7 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 127
@@ -225,7 +225,7 @@
 ; CHECK-NEXT:    [[IND_END26:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[CAST_CRD22:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END23:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD22]]
-; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 120
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 112
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
@@ -233,24 +233,24 @@
 ; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[BLOCKSIZE]], -1
 ; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
 ; CHECK-NEXT:    [[TMP24:%.*]] = add nuw nsw i64 [[TMP23]], 1
-; CHECK-NEXT:    [[N_VEC19:%.*]] = and i64 [[TMP24]], 8589934584
+; CHECK-NEXT:    [[N_VEC19:%.*]] = and i64 [[TMP24]], 8589934576
 ; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
 ; CHECK-NEXT:    [[IND_END25:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]]
 ; CHECK-NEXT:    [[IND_END28:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT36:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT35]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT36:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT35]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[NEXT_GEP32:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX20]]
 ; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX20]]
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP32]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 2
-; CHECK-NEXT:    [[TMP26:%.*]] = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> [[WIDE_LOAD34]], <8 x i8> [[WIDE_LOAD34]], <8 x i8> [[BROADCAST_SPLAT36]])
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP33]] to <8 x i8>*
-; CHECK-NEXT:    store <8 x i8> [[TMP26]], <8 x i8>* [[TMP27]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT21]] = add i64 [[INDEX20]], 8
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP32]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <16 x i8>, <16 x i8>* [[TMP25]], align 2
+; CHECK-NEXT:    [[TMP26:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD34]], <16 x i8> [[WIDE_LOAD34]], <16 x i8> [[BROADCAST_SPLAT36]])
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP33]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP26]], <16 x i8>* [[TMP27]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT21]] = add i64 [[INDEX20]], 16
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC19]]
 ; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
 ; CHECK:       vec.epilog.middle.block:
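
Note on the updated test constants: they all follow from the epilogue VF going from 8 to 16 under the new per-lane comparison. The min-iters threshold is VF - 1, so 7 becomes 15, and the trip-count masks round down to a multiple of 16 instead of 8: 8589934584 = 0x1FFFFFFF8 clears the low 3 bits of the 33-bit trip count, 8589934576 = 0x1FFFFFFF0 clears the low 4; likewise 120 = 127 & ~7 and 112 = 127 & ~15. A small standalone sketch of that arithmetic (roundDownToVF is a hypothetical helper, not an LLVM API):

#include <cassert>
#include <cstdint>

// Round a trip count down to a multiple of a power-of-two VF by clearing
// the low log2(VF) bits, the same computation as the N_VEC and
// N_VEC_REMAINING masks in the checks above.
static uint64_t roundDownToVF(uint64_t TripCount, uint64_t VF) {
  return TripCount & ~(VF - 1);
}

int main() {
  // Epilogue trip-count mask over the 33-bit count (zext i32 plus 1):
  assert(roundDownToVF(0x1FFFFFFFFULL, 8) == 8589934584ULL);  // old, VF = 8
  assert(roundDownToVF(0x1FFFFFFFFULL, 16) == 8589934576ULL); // new, VF = 16
  // Remaining-iteration mask, rounded down to the epilogue VF:
  assert(roundDownToVF(127, 8) == 120);  // old N_VEC_REMAINING constant
  assert(roundDownToVF(127, 16) == 112); // new N_VEC_REMAINING constant
  return 0;
}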