Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -199,6 +199,11 @@
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
+static cl::opt<bool> IgnoreOutOfLoopReductionCost(
+    "vectorizer-ignore-out-of-loop-reduction-cost", cl::init(false),
+    cl::desc("Ignore the cost of out-of-loop reductions in the vectorizer "
+             "cost model"));
+
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1654,6 +1659,9 @@
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Returns total cost for out-of-loop reductions.
+  InstructionCost getOutOfLoopReductionCost(VectorizationFactor VF);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -6505,6 +6513,72 @@
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getOutOfLoopReductionCost(VectorizationFactor VF) {
+  InstructionCost ReduxCost = 0;
+  if (VF.Width.isScalar() || IgnoreOutOfLoopReductionCost)
+    return ReduxCost;
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  for (auto &Reduction : Legal->getReductionVars()) {
+    PHINode *Phi = Reduction.first;
+    auto *VectorTy = cast<VectorType>(ToVectorTy(Phi->getType(), VF.Width));
+    const RecurrenceDescriptor &RdxDesc = Reduction.second;
+    if (isInLoopReduction(Phi))
+      continue;
+    RecurKind RK = RdxDesc.getRecurrenceKind();
+    auto FMF = RdxDesc.getFastMathFlags();
+    switch (RK) {
+    case RecurKind::Add:
+    case RecurKind::Mul:
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {
+      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RK);
+      ReduxCost +=
+          TTI.getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      break;
+    }
+    case RecurKind::FMax:
+    case RecurKind::FMin:
+    case RecurKind::FMaximum:
+    case RecurKind::FMinimum:
+    case RecurKind::SMax:
+    case RecurKind::SMin:
+    case RecurKind::UMax:
+    case RecurKind::UMin: {
+      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
+      ReduxCost += TTI.getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+      break;
+    }
+    case RecurKind::FMulAdd: {
+      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RK);
+      ReduxCost +=
+          TTI.getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
+      // normal fmul instruction to the cost of the fadd reduction.
+      ReduxCost +=
+          TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
+      break;
+    }
+    case RecurKind::FAnyOf:
+    case RecurKind::IAnyOf: {
+      // This has the cost of vector.reduce.or, but may have other costs as
+      // well. FIXME: This recur kind does not have a well-defined cost yet.
+      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RecurKind::Or);
+      ReduxCost +=
+          TTI.getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      break;
+    }
+    default:
+      llvm_unreachable("Unexpected reduction operation!");
+    }
+  }
+  return ReduxCost;
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(
     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
@@ -9723,14 +9797,28 @@
   }
 }
 
-static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF,
-                                       std::optional<unsigned> VScale, Loop *L,
-                                       ScalarEvolution &SE) {
-  InstructionCost CheckCost = Checks.getCost();
+// What makes the loop unprofitable to vectorize.
+namespace OutOfLoopCost {
+enum Reason {
+  None, // OutOfLoopCost is zero.
+  RuntimeCheck,
+  OutOfLoopReduction,
+  Some // Combination of above reasons: We have both runtime checks and out of
+       // loop reductions.
+};
+}
+
+static OutOfLoopCost::Reason areOutOfLoopComputationsProfitable(
+    InstructionCost CheckCost, InstructionCost ReduxCost,
+    VectorizationFactor &VF, std::optional<unsigned> VScale, Loop *L,
+    ScalarEvolution &SE) {
   if (!CheckCost.isValid())
-    return false;
+    return OutOfLoopCost::RuntimeCheck;
+  auto ReduxCostVal = *ReduxCost.getValue();
+  double RtC = *CheckCost.getValue();
+  if (!ReduxCostVal && !RtC)
+    return OutOfLoopCost::None;
 
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
   if (VF.Width.isScalar()) {
@@ -9738,15 +9826,21 @@
       LLVM_DEBUG(
           dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
-      return false;
+      return OutOfLoopCost::RuntimeCheck;
     }
-    return true;
+    if (ReduxCostVal) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving only is not profitable due to "
+                           "out of loop reductions\n");
+      return OutOfLoopCost::OutOfLoopReduction;
+    }
+    return OutOfLoopCost::None;
   }
 
-  // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
+  // The scalar cost should only be 0 when vectorizing with a user specified
+  // VF/IC. In those cases, ignore out of loop costs.
   double ScalarC = *VF.ScalarCost.getValue();
   if (ScalarC == 0)
-    return true;
+    return OutOfLoopCost::None;
 
   // First, compute the minimum iteration count required so that the vector
   // loop outperforms the scalar loop.
@@ -9757,7 +9851,7 @@
   //  * ScalarC is the cost of a single scalar iteration.
   //
   // The total cost of the vector loop is
-  //   RtC + VecC * (TC / VF) + EpiC
+  //   RtC + VecC * (TC / VF) + EpiC + ReduxCost
   // where
   //  * RtC is the cost of the generated runtime checks
   //  * VecC is the cost of a single vector iteration.
@@ -9765,13 +9859,15 @@
   //  * VF is the vectorization factor
   //  * EpiCost is the cost of the generated epilogue, including the cost
   //    of the remaining scalar operations.
+  //  * ReduxCost is the cost of out-of-loop reductions which are executed if
+  //    the vector loop is taken.
   //
   // Vectorization is profitable once the total vector cost is less than the
   // total scalar cost:
-  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
+  //   RtC + VecC * (TC / VF) + EpiC + ReduxCost < ScalarC * TC
   //
   // Now we can compute the minimum required trip count TC as
-  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+  //   (RtC + EpiC + ReduxCost) / (ScalarC - (VecC / VF)) < TC
   //
   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
@@ -9784,8 +9880,7 @@
     IntVF *= AssumedMinimumVscale;
   }
   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
-  double RtC = *CheckCost.getValue();
-  double MinTC1 = RtC / (ScalarC - VecCOverVF);
+  double MinTC1 = (RtC + ReduxCostVal) / (ScalarC - VecCOverVF);
 
   // Second, compute a minimum iteration count so that the cost of the
   // runtime checks is only a fraction of the total scalar loop cost. This
@@ -9794,6 +9889,8 @@
   //   * TC. To bound the runtime check to be a fraction 1/X of the scalar
   //     cost, compute
   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
+  // Note that we can ignore ReduxCost here since out-of-loop reductions are
+  // computed only if the vector loop is taken.
   double MinTC2 = RtC * 10 / ScalarC;
 
   // Now pick the larger minimum. If it is not a multiple of VF, choose the
@@ -9802,9 +9899,9 @@
   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
 
-  LLVM_DEBUG(
-      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
-             << VF.MinProfitableTripCount << "\n");
+  LLVM_DEBUG(dbgs() << "LV: Minimum required TC for out-of-loop computations "
+                       "to be profitable:"
+                    << VF.MinProfitableTripCount << "\n");
 
   // Skip vectorization if the expected trip count is less than the minimum
   // required trip count.
@@ -9816,10 +9913,14 @@
                           << *ExpectedTC << " < " << VF.MinProfitableTripCount
                           << ")\n");
-      return false;
+      // If possible, return the exact reason we cannot vectorize the small
+      // trip count loop.
+      return (!RtC)            ? OutOfLoopCost::OutOfLoopReduction
+             : !(ReduxCostVal) ? OutOfLoopCost::RuntimeCheck
+                               : OutOfLoopCost::Some;
     }
   }
-  return true;
+  return OutOfLoopCost::None;
 }
 
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
@@ -10010,23 +10111,42 @@
     if (VF.Width.isVector() || SelectedIC > 1)
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
 
-    // Check if it is profitable to vectorize with runtime checks.
+    // Check if it is profitable to vectorize with out of loop computations
+    // (such as reductions and runtime checks).
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
-    if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
-                                    *PSE.getSE())) {
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysisAliasing(
-                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
-                   L->getHeader())
-               << "loop not vectorized: cannot prove it is safe to reorder "
-                  "memory operations";
-      });
-      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
-      Hints.emitRemarkWithHints();
-      return false;
-    }
+    if (!ForceVectorization) {
+      InstructionCost RTCheckCost = Checks.getCost();
+      InstructionCost ReduxCost = CM.getOutOfLoopReductionCost(VF);
+
+      auto UnprofitableReason = areOutOfLoopComputationsProfitable(
+          RTCheckCost, ReduxCost, VF, getVScaleForTuning(L, *TTI), L,
+          *PSE.getSE());
+      switch (UnprofitableReason) {
+      case OutOfLoopCost::None:
+        break;
+      case OutOfLoopCost::RuntimeCheck: {
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysisAliasing(
+                     DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+                     L->getHeader())
+                 << "loop not vectorized: cannot prove it is safe to reorder "
+                    "memory operations";
+        });
+        LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+        Hints.emitRemarkWithHints();
+        return false;
+      }
+      case OutOfLoopCost::OutOfLoopReduction:
+        LLVM_DEBUG(dbgs() << "LV: Costly out of loop reductions for small "
+                             "trip count loop.\n");
+        return false;
+      default:
+        LLVM_DEBUG(dbgs() << "LV: Costly out of loop computation for small "
+                             "trip count loop.\n");
+        return false;
+      }
+    } // ForceVectorization
   }
 
   // Identify the diagnostic messages that should be produced.
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: [[CMP_NOT16:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP_NOT16]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK: while.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -4
@@ -86,7 +86,7 @@
 ; CHECK-NEXT: [[ACCUM_1]] = phi float [ [[ADD4]], [[IF_THEN]] ], [ [[ACCUM_017]], [[WHILE_BODY]] ]
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLOCKSIZE_ADDR_018]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: while.end:
 ; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret float [[ACCUM_0_LCSSA]]
Index: llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, ptr [[T7]], i32 [[T2]]
 ; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[ARRAYIDX113]], align 4
 ; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 8
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
@@ -88,7 +88,7 @@
 ; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
 ; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1
 ; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
-; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: outerend:
 ; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
@@ -16,41 +16,41 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]])
-; CHECK-NEXT: [[TMP3]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]])
+; CHECK-NEXT: [[TMP1]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]])
+; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -2147483648, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -2147483648, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[COND9:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store i32 [[MIN_0_LCSSA]], ptr [[MINP:%.*]], align 4
 ; CHECK-NEXT: ret i32 [[MAX_0_LCSSA]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[TMP9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[TMP8]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[COND9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_029]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP8]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[MAX_027]])
-; CHECK-NEXT: [[TMP9]] = call i32 @llvm.smin.i32(i32 [[TMP7]], i32 [[MIN_028]])
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[COND]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[MAX_027]])
+; CHECK-NEXT: [[COND9]] = call i32 @llvm.smin.i32(i32 [[TMP6]], i32 [[MIN_028]])
 ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_029]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp26.not = icmp eq i32 %N, 0
Index: llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -78,34 +78,35 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP7]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -117,7 +118,7 @@
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -14,48 +14,49 @@
 ; OUTLOOP: for.body.preheader:
 ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
+; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
+; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP2]]
 ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; OUTLOOP: vector.ph:
-; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
-; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
+; OUTLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP4]]
 ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
 ; OUTLOOP: vector.body:
 ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; OUTLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
-; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
-; OUTLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; OUTLOOP-NEXT: [[TMP8]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
-; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]]
-; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP5]]
+; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2
+; OUTLOOP-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; OUTLOOP-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP8]]
+; OUTLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]]
+; OUTLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OUTLOOP: middle.block:
-; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP8]])
+; OUTLOOP-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
 ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; OUTLOOP: scalar.ph:
 ; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; OUTLOOP: for.body:
 ; OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; OUTLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
+; OUTLOOP-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
 ; OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
 ; OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; OUTLOOP: for.cond.cleanup.loopexit:
-; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
 ; OUTLOOP: for.cond.cleanup:
 ; OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -402,36 +402,37 @@
 ; VLENUNK-NEXT: entry:
 ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; VLENUNK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLENUNK: vector.body:
 ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
-; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
-; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; VLENUNK-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; VLENUNK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; VLENUNK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]]
+; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; VLENUNK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; VLENUNK-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLENUNK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; VLENUNK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VLENUNK: middle.block:
-; VLENUNK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
+; VLENUNK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
 ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; VLENUNK: scalar.ph:
 ; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]
 ; VLENUNK: for.body:
 ; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -445,43 +446,44 @@
 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; VLENUNK: for.end:
-; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
 ;
 ; VLEN128-LABEL: @indexed_load(
 ; VLEN128-NEXT: entry:
 ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; VLEN128-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLEN128: vector.body:
 ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
-; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
-; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; VLEN128-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; VLEN128-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; VLEN128-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]]
+; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; VLEN128-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; VLEN128-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLEN128-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; VLEN128-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VLEN128: middle.block:
-; VLEN128-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
+; VLEN128-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
 ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; VLEN128: scalar.ph:
 ; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLEN128-NEXT: br label [[FOR_BODY:%.*]]
 ; VLEN128: for.body:
 ; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -495,7 +497,7 @@
 ; VLEN128-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; VLEN128-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; VLEN128: for.end:
-; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLEN128-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/ctpop-small-trip-count.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/ctpop-small-trip-count.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=loop-vectorize -mcpu=znver2 -vectorizer-ignore-out-of-loop-reduction-cost=0 -force-vector-interleave=1 < %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR 57476
+; Hard-coded trip count of 2.
+; FIXME: We still vectorize it, since the reduction cost is 1 (for VF=2).
+define i64 @test_trip_count_2(ptr %arr) {
+; CHECK-LABEL: define i64 @test_trip_count_2(
+; CHECK-SAME: ptr [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = add <2 x i64> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP4]])
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ACCUM_NEXT:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[IV]]
+; CHECK-NEXT: [[VALUE:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[CTPOP:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[VALUE]])
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i64 [[ACCUM]], [[CTPOP]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[LCSSA:%.*]] = phi i64 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %accum = phi i64 [ %accum.next, %loop ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %iv.next = add nuw i64 %iv, 1
+  %gep = getelementptr inbounds i64, ptr %arr, i64 %iv
+  %value = load i64, ptr %gep, align 8
+  %ctpop = tail call i64 @llvm.ctpop.i64(i64 %value)
+  %accum.next = add i64 %accum, %ctpop
+  %exitcond = icmp eq i64 %iv.next, 2
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %lcssa = phi i64 [ %accum.next, %loop ]
+  ret i64 %lcssa
+}
+
+; Same loop as above, with a profile showing a trip count of 2.
+; We do not vectorize this when we consider the cost of reductions, since the
+; reduction cost along with the vector cost (with VF=4) is higher than the
+; scalar cost.
+define i64 @test_trip_count_prof_2(ptr %arr, i64 %n) {
+; CHECK-LABEL: define i64 @test_trip_count_prof_2(
+; CHECK-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ACCUM_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[IV]]
+; CHECK-NEXT: [[VALUE:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[CTPOP:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[VALUE]])
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i64 [[ACCUM]], [[CTPOP]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]], !prof [[PROF4:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[LCSSA:%.*]] = phi i64 [ [[ACCUM_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: ret i64 [[LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %accum = phi i64 [ %accum.next, %loop ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %iv.next = add nuw i64 %iv, 1
+  %gep = getelementptr inbounds i64, ptr %arr, i64 %iv
+  %value = load i64, ptr %gep, align 8
+  %ctpop = tail call i64 @llvm.ctpop.i64(i64 %value)
+  %accum.next = add i64 %accum, %ctpop
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop, !prof !2
+
+exit:
+  %lcssa = phi i64 [ %accum.next, %loop ]
+  ret i64 %lcssa
+}
+declare i64 @llvm.ctpop.i64(i64)
+
+!2 = !{!"branch_weights", i32 1, i32 2}
Index: llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
@@ -0,0 +1,94 @@
+; RUN: opt -S -passes=loop-vectorize,dce -mcpu=skylake -vectorizer-ignore-out-of-loop-reduction-cost=0 -force-vector-interleave=1 < %s | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fabs.f32(float)
+
+; This is a small trip count loop. The cost of the out-of-loop reduction is
+; significant in this case when we only perform a single vector iteration.
+; However, the loop vectorizer does not consider out-of-loop reduction costs.
+
+; CHECK-LABEL: fmaximum_intrinsic
+; CHECK-NOT: llvm.vector.reduce.fmaximum
+define float @fmaximum_intrinsic(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n, i32 %tc) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.012 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi float [ 0.000000e+00, %entry ], [ %max, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.012
+  %x_f = load float, ptr %arrayidx, align 4
+  %arrayidxy = getelementptr inbounds float, ptr %y, i32 %i.012
+  %y_f = load float, ptr %arrayidxy, align 4
+  %sub = fsub float %x_f, %y_f
+  %fabs = call float @llvm.fabs.f32(float %sub)
+  %max = tail call float @llvm.maximum.f32(float %s.011, float %fabs)
+  %inc = add nuw nsw i32 %i.012, 1
+  %exitcond = icmp ult i32 %inc, 3
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret float %max
+}
+
+; A trip count of 6 is still considered non-profitable for reducing adds (the
+; minimum trip count required is 8).
+; CHECK-LABEL: reduction_sum
+; CHECK-NOT: llvm.vector.reduce.add
+define i32 @reduction_sum(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %5 = load i32, ptr %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %sum.02, %6
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !prof !1
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+; CHECK-LABEL: reduction_mix
+; CHECK-LABEL: middle.block:
+; CHECK-NEXT: vector.reduce.add
+; CHECK-NEXT: br
+define i32 @reduction_mix(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %5 = load i32, ptr %4, align 4
+  %6 = mul nsw i32 %5, %3
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %sum.02, %7
+  %9 = add i32 %8, %6
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !prof !2
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+!1 = !{!"branch_weights", i32 1, i32 5}
+!2 = !{!"branch_weights", i32 1, i32 7}
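
Note (appended after the diff, not part of the patch): the trip-count bound evaluated by areOutOfLoopComputationsProfitable can be sanity-checked in isolation. Below is a minimal standalone C++ sketch of the same arithmetic, mirroring the cost-model comments in the patch; all cost values used are made-up illustrative numbers, not outputs of this patch or of TTI.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the comment block in the patch: vectorization is profitable once
    //   RtC + VecC * (TC / VF) + EpiC + ReduxCost < ScalarC * TC
    // and, with EpiC assumed to be 0, the minimum trip count satisfies
    //   (RtC + ReduxCost) / (ScalarC - VecC / VF) < TC.
    static uint64_t minProfitableTripCount(double RtC, double ReduxCost,
                                           double ScalarC, double VecC,
                                           unsigned VF) {
      // First bound: the vector loop plus its out-of-loop work must beat the
      // scalar loop.
      double MinTC1 = (RtC + ReduxCost) / (ScalarC - VecC / VF);
      // Second bound: runtime checks must stay below ~10% of the scalar loop
      // cost. ReduxCost is left out here, as it is only paid when the vector
      // loop actually runs.
      double MinTC2 = RtC * 10 / ScalarC;
      uint64_t MinTC =
          static_cast<uint64_t>(std::ceil(std::max(MinTC1, MinTC2)));
      // Round up to a multiple of VF, as the patch does with alignTo().
      return (MinTC + VF - 1) / VF * VF;
    }

    int main() {
      // Illustrative inputs: scalar iteration cost 4, vector iteration cost 8
      // at VF=4, runtime-check cost 12, out-of-loop reduction cost 6.
      std::printf("MinTC = %llu\n", static_cast<unsigned long long>(
                                        minProfitableTripCount(12, 6, 4, 8, 4)));
      return 0;
    }

With these inputs MinTC1 = (12 + 6) / (4 - 8/4) = 9 and MinTC2 = 12 * 10 / 4 = 30; the larger bound (30) rounded up to a multiple of VF=4 gives 32, so a loop expected to run fewer than 32 iterations would be rejected, which is the mechanism behind the raised MIN_ITERS_CHECK thresholds in the test diffs above.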