Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -581,13 +581,28 @@ AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const; + /// The preferred vectorization predication scheme of the target for the given + /// loop. + /// - ScalarEpilogue: Don't attempt to predicate the vector body (or epilogue) + /// - PredicateElseEpilogue: Attempt to predicate the body, else fall back to + /// an unpredicated body. + /// - PredicateOrDontVectorize: Attempt to predicate the body, else don't + /// vectorize. + /// - UsePredicatedEpilogue: Use an unpredicated vector body with a predicated + /// remainder. + enum class PreferPredicateTy { + ScalarEpilogue, + PredicateElseEpilogue, + PredicateOrDontVectorize, + UsePredicatedEpilogue, + }; + /// Query the target whether it would be preferred to create a predicated /// vector loop, which can avoid the need to emit a scalar epilogue loop. - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, - LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI) const; + PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI) const; /// Query the target what the preferred style of tail folding is. /// \param IVUpdateMayOverflow Tells whether it is known if the IV update @@ -1700,15 +1715,14 @@ AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) = 0; - virtual bool - preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI) = 0; + virtual PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI) = 0; virtual TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0; - virtual std::optional<Instruction *> instCombineIntrinsic( - InstCombiner &IC, IntrinsicInst &II) = 0; + virtual std::optional<Instruction *> + instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) = 0; virtual std::optional<Value *> simplifyDemandedUseBitsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) = 0; @@ -2106,12 +2120,11 @@ HardwareLoopInfo &HWLoopInfo) override { return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, - LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI) override { - return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); + TTI::PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI) override { + return Impl.getPreferredVectorPredication(L, LI, SE, AC, TLI, DT, LVL, IAI); } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) override { Index: 
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -163,12 +163,11 @@ return false; } - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, - LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI) const { - return false; + TTI::PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI) const { + return TTI::PreferPredicateTy::ScalarEpilogue; } TailFoldingStyle Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -622,12 +622,12 @@ return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, - LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI) { - return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); + TTI::PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI) { + return BaseT::getPreferredVectorPredication(L, LI, SE, AC, TLI, DT, LVL, + IAI); } TailFoldingStyle Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -313,11 +313,12 @@ return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } -bool TargetTransformInfo::preferPredicateOverEpilogue( +TTI::PreferPredicateTy TargetTransformInfo::getPreferredVectorPredication( Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) const { - return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); + return TTIImpl->getPreferredVectorPredication(L, LI, SE, AC, TLI, DT, LVL, + IAI); } TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle( Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -356,11 +356,10 @@ return TailFoldingStyle::DataWithoutLaneMask; } - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, - LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI); + TTI::PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI); bool supportsScalableVectors() const { return ST->hasSVE(); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- 
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3390,18 +3390,18 @@ return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); } -bool AArch64TTIImpl::preferPredicateOverEpilogue( +TTI::PreferPredicateTy AArch64TTIImpl::getPreferredVectorPredication( Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) { if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled) - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; // We don't currently support vectorisation with interleaving for SVE - with // such loops we're better off not using tail-folding. This gives us a chance // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. if (IAI->hasGroups()) - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; TailFoldingKind Required; // Defaults to 0. if (LVL->getReductionVars().size()) @@ -3411,7 +3411,9 @@ if (!Required) Required.add(TailFoldingKind::TFSimple); - return (TailFoldingKindLoc & Required) == Required; + return (TailFoldingKindLoc & Required) == Required + ? TTI::PreferPredicateTy::PredicateElseEpilogue + : TTI::PreferPredicateTy::ScalarEpilogue; } InstructionCost Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -300,14 +300,12 @@ bool maybeLoweredToCall(Instruction &I); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, - AssumptionCache &AC, - TargetLibraryInfo *LibInfo, + AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo); - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, - LoopVectorizationLegality *LVL, - InterleavedAccessInfo *IAI); + TTI::PreferPredicateTy getPreferredVectorPredication( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, + InterleavedAccessInfo *IAI); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2238,52 +2238,55 @@ return true; } -bool ARMTTIImpl::preferPredicateOverEpilogue( +TTI::PreferPredicateTy ARMTTIImpl::getPreferredVectorPredication( Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) { if (!EnableTailPredication) { LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n"); - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; } // Creating a predicated vector loop is the first step for generating a // tail-predicated hardware loop, for which we need the MVE masked // load/stores instructions: if (!ST->hasMVEIntegerOps()) - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; // For now, restrict this to single block loops. 
if (L->getNumBlocks() > 1) { - LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block " + LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: not a single block " "loop.\n"); - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; } - assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); + assert(L->isInnermost() && + "getPreferredVectorPredication: inner-loop expected"); HardwareLoopInfo HWLoopInfo(L); if (!HWLoopInfo.canAnalyze(*LI)) { - LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " + LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: hardware-loop is not " "analyzable.\n"); - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; } // This checks if we have the low-overhead branch architecture // extension, and if we will create a hardware-loop: if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { - LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " + LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: hardware-loop is not " "profitable.\n"); - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; } if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) { - LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " + LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: hardware-loop is not " "a candidate.\n"); - return false; + return TTI::PreferPredicateTy::ScalarEpilogue; } - return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI()); + if (!canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI())) + return TTI::PreferPredicateTy::ScalarEpilogue; + return TTI::PreferPredicateTy::PredicateElseEpilogue; } TailFoldingStyle Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -201,36 +201,32 @@ "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks")); -// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, -// that predication is preferred, and this lists all options. I.e., the -// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// Option prefer-predicate-over-epilogue indicates that an epilogue is +// undesired, that predication is preferred, and this lists all options. I.e., +// the vectorizer will try to fold the tail-loop (epilogue) into the vector body // and predicate the instructions accordingly. 
If tail-folding fails, there are // different fallback strategies depending on these values: -namespace PreferPredicateTy { - enum Option { - ScalarEpilogue = 0, - PredicateElseScalarEpilogue, - PredicateOrDontVectorize - }; -} // namespace PreferPredicateTy -static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( +static cl::opt<TTI::PreferPredicateTy> PreferPredicateOverEpilogue( "prefer-predicate-over-epilogue", - cl::init(PreferPredicateTy::ScalarEpilogue), - cl::Hidden, + cl::init(TTI::PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), - cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, - "scalar-epilogue", - "Don't tail-predicate loops, create scalar epilogue"), - clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, - "predicate-else-scalar-epilogue", - "prefer tail-folding, create scalar epilogue if tail " - "folding fails."), - clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, - "predicate-dont-vectorize", - "prefers tail-folding, don't attempt vectorization if " - "tail-folding fails."))); + cl::values( + clEnumValN(TTI::PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN( + TTI::PreferPredicateTy::PredicateElseEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create epilogue if tail folding fails."), + clEnumValN(TTI::PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."), + clEnumValN(TTI::PreferPredicateTy::UsePredicatedEpilogue, + "use-predicated-epilogue", + "Don't tail-predicate the loops but allow a predicated " + "epilogue."))); static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( "force-tail-folding-style", cl::desc("Force the tail folding style"), @@ -1141,27 +1137,32 @@ namespace llvm { -// Loop vectorization cost-model hints how the scalar epilogue loop should be +// Loop vectorization cost-model hints how the loop and epilogues should be // lowered. enum ScalarEpilogueLowering { - - // The default: allowing scalar epilogues. + // The default: allowing epilogues but don't attempt to predicate the vector + // body (FoldTailByMasking) or the remainder. CM_ScalarEpilogueAllowed, // Vectorization with OptForSize: don't allow epilogues. CM_ScalarEpilogueNotAllowedOptSize, - // A special case of vectorisation with OptForSize: loops with a very small - // trip count are considered for vectorization under OptForSize, thereby - // making sure the cost of their loop body is dominant, free of runtime - // guards and scalar iteration overheads. + // The same as CM_ScalarEpilogueNotAllowedOptSize but for a different reason + // (the trip count is known to be low) and the reported remarks are different. CM_ScalarEpilogueNotAllowedLowTripLoop, - // Loop hint predicate indicating an epilogue is undesired. + // Loop hint 'predicate' indicating that predicating the entire loop might be + // profitable. Picks between predicated and unpredicated plans based on which + // is lower cost. The remainder can be predicated or scalar if required. CM_ScalarEpilogueNotNeededUsePredicate, // Directive indicating we must either tail fold or not vectorize - CM_ScalarEpilogueNotAllowedUsePredicate + CM_ScalarEpilogueNotAllowedUsePredicate, + + // An unpredicated main loop like CM_ScalarEpilogueAllowed, but allow + // predicated remainders if profitable. 
+ CM_ScalarEpilogueAllowedUsePredRemainder, }; /// ElementCountComparator creates a total ordering for ElementCount @@ -1555,7 +1556,8 @@ bool isScalarEpilogueAllowed() const { return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed || (!FoldTailByMasking && - ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate); + (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueAllowedUsePredRemainder)); } /// Returns the TailFoldingStyle that is best for the current loop. @@ -1580,6 +1582,13 @@ return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Return true when a predicated vector body is allowed, which is when tail + /// folding is enabled and ScalarEpilogueStatus is not + /// CM_ScalarEpilogueAllowedUsePredRemainder. + bool allowPredicatedVectorBody() const { + return FoldTailByMasking && + ScalarEpilogueStatus != CM_ScalarEpilogueAllowedUsePredRemainder; + } + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. @@ -5085,6 +5094,13 @@ LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" << "LV: Trying predicated vector loop.\n"); break; + case CM_ScalarEpilogueAllowedUsePredRemainder: + // If this cost model is for unpredicated plans, then generate them so that + // the loop body can be unpredicated and the remainder predicated. + if (!FoldTailByMasking) + return computeFeasibleMaxVF(TC, UserVF, false); + LLVM_DEBUG(dbgs() << "LV: Allowing predication for remainder loops.\n"); + break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize case CM_ScalarEpilogueNotAllowedOptSize: @@ -5523,7 +5539,9 @@ if (isMoreProfitable(Candidate, ScalarCost)) ProfitableVFs.push_back(Candidate); - if (isMoreProfitable(Candidate, ChosenFactor)) + if ((!Candidate.FoldTailByMasking || + VPlan->getCostModel()->allowPredicatedVectorBody()) && + isMoreProfitable(Candidate, ChosenFactor)) ChosenFactor = Candidate; } } @@ -5624,6 +5642,8 @@ ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (hasPlanWithVF(ForcedEC, false)) return {ForcedEC, false, 0, 0}; + else if (hasPlanWithVF(ForcedEC, true)) + return {ForcedEC, true, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -9787,12 +9807,14 @@ // 2) If set, obey the directives if (PreferPredicateOverEpilogue.getNumOccurrences()) { switch (PreferPredicateOverEpilogue) { - case PreferPredicateTy::ScalarEpilogue: + case TTI::PreferPredicateTy::ScalarEpilogue: return CM_ScalarEpilogueAllowed; - case PreferPredicateTy::PredicateElseScalarEpilogue: + case TTI::PreferPredicateTy::PredicateElseEpilogue: return CM_ScalarEpilogueNotNeededUsePredicate; - case PreferPredicateTy::PredicateOrDontVectorize: + case TTI::PreferPredicateTy::PredicateOrDontVectorize: return CM_ScalarEpilogueNotAllowedUsePredicate; + case TTI::PreferPredicateTy::UsePredicatedEpilogue: + return CM_ScalarEpilogueAllowedUsePredRemainder; }; } @@ -9804,11 +9826,29 @@ return CM_ScalarEpilogueAllowed; }; - // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) + // 4) Otherwise ask the TTI hook for the target's preferred option, which + // defaults to CM_ScalarEpilogueAllowed. 
+ TTI::PreferPredicateTy TargetPreferredPredication = + TTI->getPreferredVectorPredication(L, LI, *SE, *AC, TLI, DT, &LVL, IAI); + switch (TargetPreferredPredication) { + default: + llvm_unreachable("Unexpected PreferPredicateTy returned from the target!"); + case TTI::PreferPredicateTy::ScalarEpilogue: + LLVM_DEBUG(dbgs() << "LV: Target has picked ScalarEpilogue Predication\n"); + return CM_ScalarEpilogueAllowed; + case TTI::PreferPredicateTy::PredicateElseEpilogue: + LLVM_DEBUG( + dbgs() << "LV: Target has picked PredicateElseEpilogue Predication\n"); return CM_ScalarEpilogueNotNeededUsePredicate; - - return CM_ScalarEpilogueAllowed; + case TTI::PreferPredicateTy::PredicateOrDontVectorize: + LLVM_DEBUG(dbgs() << "LV: Target has picked PredicateOrDontVectorize " + "Predication\n"); + return CM_ScalarEpilogueNotAllowedUsePredicate; + case TTI::PreferPredicateTy::UsePredicatedEpilogue: + LLVM_DEBUG( + dbgs() << "LV: Target has picked UsePredicatedEpilogue Predication\n"); + return CM_ScalarEpilogueAllowedUsePredRemainder; + }; } Value *VPTransformState::get(VPValue *Def, unsigned Part) { Index: llvm/test/Transforms/LoopVectorize/ARM/epilog-predicated.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/ARM/epilog-predicated.ll @@ -0,0 +1,672 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,simplifycfg -S %s | FileCheck %s --check-prefix=DEFAULT +; RUN: opt -passes=loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck %s --check-prefix=PREDBODY +; RUN: opt -passes=loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=use-predicated-epilogue -S %s | FileCheck %s --check-prefix=PREDEPI + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-eabi" + +; By default we should vectorize with a predicated body for these loops. 
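+;
+; As a rough illustration (not part of the autogenerated checks), the @add loop
+; below corresponds to a simple C loop of the form:
+;
+;   for (int i = 0; i < n; i++)
+;     y[i] = x[i] + 1;
+;
+; In the DEFAULT and PREDBODY runs the single vector body is masked via
+; llvm.get.active.lane.mask, while the PREDEPI run keeps the main vector body
+; unmasked and only masks the vec.epilog.vector.body remainder loop.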
+ +define noundef i32 @add(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) #0 { +; DEFAULT-LABEL: @add( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DEFAULT-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; DEFAULT: vector.ph: +; DEFAULT-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; DEFAULT-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP3]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; DEFAULT-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT: for.cond.cleanup: +; DEFAULT-NEXT: ret i32 0 +; +; PREDBODY-LABEL: @add( +; PREDBODY-NEXT: entry: +; PREDBODY-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDBODY-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDBODY: vector.ph: +; PREDBODY-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; PREDBODY-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; PREDBODY-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDBODY-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDBODY: vector.body: +; PREDBODY-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDBODY-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) +; PREDBODY-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; PREDBODY-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; PREDBODY-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDBODY-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], +; PREDBODY-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]] +; PREDBODY-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; PREDBODY-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP3]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; PREDBODY-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; PREDBODY-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDBODY-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP0:![0-9]+]] +; PREDBODY: for.cond.cleanup: +; PREDBODY-NEXT: ret i32 0 +; +; PREDEPI-LABEL: @add( +; PREDEPI-NEXT: entry: +; PREDEPI-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDEPI-NEXT: br i1 [[CMP6]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDEPI: vector.main.loop.iter.check: +; PREDEPI-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; PREDEPI-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDEPI: vector.ph: +; PREDEPI-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; PREDEPI-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; PREDEPI-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDEPI: vector.body: +; PREDEPI-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDEPI-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDEPI-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; PREDEPI-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; PREDEPI-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; PREDEPI-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; PREDEPI-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]] +; PREDEPI-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; PREDEPI-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4 +; PREDEPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PREDEPI-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDEPI-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDEPI: middle.block: +; PREDEPI-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; PREDEPI-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]] +; PREDEPI: vec.epilog.ph: +; PREDEPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; PREDEPI-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; PREDEPI-NEXT: [[N_MOD_VF1:%.*]] = urem i32 [[N_RND_UP]], 4 +; PREDEPI-NEXT: [[N_VEC2:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF1]] +; PREDEPI-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; PREDEPI: vec.epilog.vector.body: +; PREDEPI-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDEPI-NEXT: [[TMP7:%.*]] = add i32 [[INDEX3]], 0 +; PREDEPI-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP7]], i32 [[N]]) +; PREDEPI-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP7]] +; PREDEPI-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; PREDEPI-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDEPI-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], +; PREDEPI-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[TMP7]] +; PREDEPI-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; PREDEPI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP10]], ptr [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; PREDEPI-NEXT: [[INDEX_NEXT4]] = add i32 [[INDEX3]], 4 +; PREDEPI-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT4]], [[N_VEC2]] +; PREDEPI-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; PREDEPI: for.cond.cleanup: +; 
PREDEPI-NEXT: ret i32 0 +; +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, ptr %y, i32 %indvars.iv + store i32 %add, ptr %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond.not = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret i32 0 +} + +define noundef i32 @interleave(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) #0 { +; DEFAULT-LABEL: @interleave( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DEFAULT-NEXT: br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; DEFAULT: for.body.preheader: +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT: vector.ph: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]] +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; DEFAULT-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; DEFAULT-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]] +; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; DEFAULT-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT: middle.block: +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; DEFAULT: scalar.ph: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] +; DEFAULT: for.body: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; DEFAULT-NEXT: [[TMP8:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP8]] +; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[TMP10:%.*]] = or i32 [[TMP8]], 1 +; 
DEFAULT-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP10]] +; DEFAULT-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; DEFAULT-NEXT: [[MUL4:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]] +; DEFAULT-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[INDVARS_IV]] +; DEFAULT-NEXT: store i32 [[MUL4]], ptr [[ARRAYIDX6]], align 4 +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT: for.cond.cleanup: +; DEFAULT-NEXT: ret i32 0 +; +; PREDBODY-LABEL: @interleave( +; PREDBODY-NEXT: entry: +; PREDBODY-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDBODY-NEXT: br i1 [[CMP12]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDBODY: vector.main.loop.iter.check: +; PREDBODY-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; PREDBODY-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDBODY: vector.ph: +; PREDBODY-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; PREDBODY-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; PREDBODY-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDBODY: vector.body: +; PREDBODY-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDBODY-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1 +; PREDBODY-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]] +; PREDBODY-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; PREDBODY-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; PREDBODY-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; PREDBODY-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; PREDBODY-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; PREDBODY-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]] +; PREDBODY-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; PREDBODY-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4 +; PREDBODY-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PREDBODY-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDBODY-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; PREDBODY: middle.block: +; PREDBODY-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; PREDBODY-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]] +; PREDBODY: vec.epilog.ph: +; PREDBODY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; PREDBODY-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; PREDBODY-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4 +; PREDBODY-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; PREDBODY-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; PREDBODY-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; PREDBODY-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; PREDBODY-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; PREDBODY: vec.epilog.vector.body: +; PREDBODY-NEXT: [[INDEX5:%.*]] = phi i32 [ 
[[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDBODY-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDBODY-NEXT: [[TMP8:%.*]] = add i32 [[INDEX5]], 0 +; PREDBODY-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]]) +; PREDBODY-NEXT: [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], +; PREDBODY-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]] +; PREDBODY-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDBODY-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP9]], +; PREDBODY-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]] +; PREDBODY-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDBODY-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER6]], [[WIDE_MASKED_GATHER]] +; PREDBODY-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[TMP8]] +; PREDBODY-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; PREDBODY-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP13]], ptr [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; PREDBODY-NEXT: [[INDEX_NEXT7]] = add i32 [[INDEX5]], 4 +; PREDBODY-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; PREDBODY-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT7]], [[N_VEC3]] +; PREDBODY-NEXT: br i1 [[TMP16]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDBODY: for.cond.cleanup: +; PREDBODY-NEXT: ret i32 0 +; +; PREDEPI-LABEL: @interleave( +; PREDEPI-NEXT: entry: +; PREDEPI-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDEPI-NEXT: br i1 [[CMP12]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDEPI: vector.main.loop.iter.check: +; PREDEPI-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; PREDEPI-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDEPI: vector.ph: +; PREDEPI-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; PREDEPI-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; PREDEPI-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDEPI: vector.body: +; PREDEPI-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDEPI-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDEPI-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1 +; PREDEPI-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]] +; PREDEPI-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; PREDEPI-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; PREDEPI-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; PREDEPI-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; PREDEPI-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; PREDEPI-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]] +; PREDEPI-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; PREDEPI-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4 +; PREDEPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PREDEPI-NEXT: [[TMP7:%.*]] = 
icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDEPI-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDEPI: middle.block: +; PREDEPI-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; PREDEPI-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]] +; PREDEPI: vec.epilog.ph: +; PREDEPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; PREDEPI-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; PREDEPI-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4 +; PREDEPI-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; PREDEPI-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; PREDEPI-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; PREDEPI-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; PREDEPI-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; PREDEPI: vec.epilog.vector.body: +; PREDEPI-NEXT: [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDEPI-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDEPI-NEXT: [[TMP8:%.*]] = add i32 [[INDEX5]], 0 +; PREDEPI-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]]) +; PREDEPI-NEXT: [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], +; PREDEPI-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]] +; PREDEPI-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDEPI-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP9]], +; PREDEPI-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]] +; PREDEPI-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDEPI-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER6]], [[WIDE_MASKED_GATHER]] +; PREDEPI-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[TMP8]] +; PREDEPI-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; PREDEPI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP13]], ptr [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; PREDEPI-NEXT: [[INDEX_NEXT7]] = add i32 [[INDEX5]], 4 +; PREDEPI-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; PREDEPI-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT7]], [[N_VEC3]] +; PREDEPI-NEXT: br i1 [[TMP16]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PREDEPI: for.cond.cleanup: +; PREDEPI-NEXT: ret i32 0 +; +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %0 = shl nuw nsw i32 %indvars.iv, 1 + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %0 + %1 = load i32, ptr %arrayidx, align 4 + %2 = or i32 %0, 1 + %arrayidx3 = getelementptr inbounds i32, ptr %x, i32 %2 + %3 = load i32, ptr %arrayidx3, align 4 + %mul4 = mul nsw i32 %3, %1 + %arrayidx6 = getelementptr inbounds i32, ptr %y, i32 
%indvars.iv + store i32 %mul4, ptr %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond.not = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret i32 0 +} + +define noundef i32 @reduce_add(ptr nocapture noundef readonly %x, i32 noundef %n) #0 { +; DEFAULT-LABEL: @reduce_add( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DEFAULT-NEXT: br i1 [[CMP4]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; DEFAULT: vector.ph: +; DEFAULT-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; DEFAULT-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; DEFAULT-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; DEFAULT-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT: for.cond.cleanup: +; DEFAULT-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: ret i32 [[S_0_LCSSA]] +; +; PREDBODY-LABEL: @reduce_add( +; PREDBODY-NEXT: entry: +; PREDBODY-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDBODY-NEXT: br i1 [[CMP4]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDBODY: vector.ph: +; PREDBODY-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; PREDBODY-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; PREDBODY-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDBODY-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDBODY: vector.body: +; PREDBODY-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDBODY-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) +; PREDBODY-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; PREDBODY-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; PREDBODY-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDBODY-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> 
zeroinitializer +; PREDBODY-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; PREDBODY-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] +; PREDBODY-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; PREDBODY-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDBODY-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PREDBODY: for.cond.cleanup: +; PREDBODY-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: ret i32 [[S_0_LCSSA]] +; +; PREDEPI-LABEL: @reduce_add( +; PREDEPI-NEXT: entry: +; PREDEPI-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDEPI-NEXT: br i1 [[CMP4]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDEPI: vector.main.loop.iter.check: +; PREDEPI-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; PREDEPI-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDEPI: vector.ph: +; PREDEPI-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; PREDEPI-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; PREDEPI-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDEPI: vector.body: +; PREDEPI-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDEPI-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; PREDEPI-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDEPI-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]] +; PREDEPI-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; PREDEPI-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; PREDEPI-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; PREDEPI-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]] +; PREDEPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PREDEPI-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDEPI-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; PREDEPI: middle.block: +; PREDEPI-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; PREDEPI-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]] +; PREDEPI: vec.epilog.ph: +; PREDEPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; PREDEPI-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; PREDEPI-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; PREDEPI-NEXT: [[N_MOD_VF1:%.*]] = urem i32 [[N_RND_UP]], 4 +; PREDEPI-NEXT: [[N_VEC2:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF1]] +; PREDEPI-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; PREDEPI: vec.epilog.vector.body: +; PREDEPI-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDEPI-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDEPI-NEXT: [[TMP6:%.*]] = add i32 [[INDEX3]], 0 +; PREDEPI-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP6]], i32 [[N]]) +; PREDEPI-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP6]] +; PREDEPI-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; PREDEPI-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP8]], i32 4, <4 
x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; PREDEPI-NEXT: [[TMP9:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; PREDEPI-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; PREDEPI-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI4]] +; PREDEPI-NEXT: [[INDEX_NEXT5]] = add i32 [[INDEX3]], 4 +; PREDEPI-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT5]], [[N_VEC2]] +; PREDEPI-NEXT: br i1 [[TMP12]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; PREDEPI: for.cond.cleanup: +; PREDEPI-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_VECTOR_BODY]] ] +; PREDEPI-NEXT: ret i32 [[S_0_LCSSA]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %s.05 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %s.05 + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond.not = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %s.0.lcssa +} + + +define noundef i32 @reduce_interleave(ptr nocapture noundef readonly %x, i32 noundef %n) #0 { +; DEFAULT-LABEL: @reduce_interleave( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DEFAULT-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; DEFAULT: for.body.preheader: +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT: vector.ph: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]] +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; DEFAULT-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; DEFAULT-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; DEFAULT-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; DEFAULT-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; DEFAULT-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP6:![0-9]+]] +; DEFAULT: middle.block: +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; DEFAULT: scalar.ph: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; DEFAULT-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] +; DEFAULT: for.body: +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; DEFAULT-NEXT: [[S_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD5:%.*]], [[FOR_BODY]] ] +; DEFAULT-NEXT: [[TMP8:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP8]] +; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[TMP10:%.*]] = or i32 [[TMP8]], 1 +; DEFAULT-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP10]] +; DEFAULT-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; DEFAULT-NEXT: [[MUL4:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]] +; DEFAULT-NEXT: [[ADD5]] = add nsw i32 [[MUL4]], [[S_012]] +; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; DEFAULT: for.cond.cleanup: +; DEFAULT-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; DEFAULT-NEXT: ret i32 [[S_0_LCSSA]] +; +; PREDBODY-LABEL: @reduce_interleave( +; PREDBODY-NEXT: entry: +; PREDBODY-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; PREDBODY-NEXT: br i1 [[CMP11]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; PREDBODY: vector.main.loop.iter.check: +; PREDBODY-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; PREDBODY-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDBODY: vector.ph: +; PREDBODY-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; PREDBODY-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; PREDBODY-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDBODY: vector.body: +; PREDBODY-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; PREDBODY-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; PREDBODY-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1 +; PREDBODY-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]] +; PREDBODY-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; PREDBODY-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; PREDBODY-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; PREDBODY-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; PREDBODY-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] +; PREDBODY-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; PREDBODY-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]] +; PREDBODY-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PREDBODY-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 
+; PREDBODY-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDBODY: middle.block:
+; PREDBODY-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDBODY-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDBODY: vec.epilog.ph:
+; PREDBODY-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; PREDBODY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDBODY-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDBODY-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDBODY-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]]
+; PREDBODY-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; PREDBODY-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PREDBODY-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; PREDBODY-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDBODY: vec.epilog.vector.body:
+; PREDBODY-NEXT: [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT: [[VEC_PHI6:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT: [[TMP8:%.*]] = add i32 [[INDEX5]], 0
+; PREDBODY-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]])
+; PREDBODY-NEXT: [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]]
+; PREDBODY-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]]
+; PREDBODY-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[WIDE_MASKED_GATHER]]
+; PREDBODY-NEXT: [[TMP14:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer
+; PREDBODY-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; PREDBODY-NEXT: [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI6]]
+; PREDBODY-NEXT: [[INDEX_NEXT8]] = add i32 [[INDEX5]], 4
+; PREDBODY-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PREDBODY-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT8]], [[N_VEC3]]
+; PREDBODY-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; PREDBODY: for.cond.cleanup:
+; PREDBODY-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT: ret i32 [[S_0_LCSSA]]
+;
+; PREDEPI-LABEL: @reduce_interleave(
+; PREDEPI-NEXT: entry:
+; PREDEPI-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDEPI-NEXT: br i1 [[CMP11]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDEPI: vector.main.loop.iter.check:
+; PREDEPI-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDEPI-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDEPI: vector.ph:
+; PREDEPI-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDEPI-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDEPI-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDEPI: vector.body:
+; PREDEPI-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDEPI-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; PREDEPI-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; PREDEPI-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; PREDEPI-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; PREDEPI-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; PREDEPI-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; PREDEPI-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; PREDEPI-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; PREDEPI-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
+; PREDEPI-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDEPI-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDEPI-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; PREDEPI: middle.block:
+; PREDEPI-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDEPI-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDEPI: vec.epilog.ph:
+; PREDEPI-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDEPI-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDEPI-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]]
+; PREDEPI-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; PREDEPI-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PREDEPI-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; PREDEPI-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDEPI: vec.epilog.vector.body:
+; PREDEPI-NEXT: [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT: [[VEC_PHI6:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT: [[TMP8:%.*]] = add i32 [[INDEX5]], 0
+; PREDEPI-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]])
+; PREDEPI-NEXT: [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]]
+; PREDEPI-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]]
+; PREDEPI-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[WIDE_MASKED_GATHER]]
+; PREDEPI-NEXT: [[TMP14:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer
+; PREDEPI-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; PREDEPI-NEXT: [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI6]]
+; PREDEPI-NEXT: [[INDEX_NEXT8]] = add i32 [[INDEX5]], 4
+; PREDEPI-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PREDEPI-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT8]], [[N_VEC3]]
+; PREDEPI-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; PREDEPI: for.cond.cleanup:
+; PREDEPI-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT: ret i32 [[S_0_LCSSA]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %s.012 = phi i32 [ 0, %for.body.preheader ], [ %add5, %for.body ]
+ %0 = shl nuw nsw i32 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds i32, ptr %x, i32 %0
+ %1 = load i32, ptr %arrayidx, align 4
+ %2 = or i32 %0, 1
+ %arrayidx3 = getelementptr inbounds i32, ptr %x, i32 %2
+ %3 = load i32, ptr %arrayidx3, align 4
+ %mul4 = mul nsw i32 %3, %1
+ %add5 = add nsw i32 %mul4, %s.012
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond.not = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi i32 [ 0, %entry ], [ %add5, %for.body ]
+ ret i32 %s.0.lcssa
+}
+
+attributes #0 = { "target-features"="+mve" }
Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -18,6 +18,7 @@
 ; CHECK-NEXT: LV: Found an induction variable.
 ; CHECK-NEXT: LV: Did not find one integer induction var.
 ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT: LV: Target has picked ScalarEpilogue Predication
 ; CHECK-NEXT: LV: Found trip count: 0
 ; CHECK-NEXT: LV: Scalable vectorization is available
 ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
@@ -151,6 +152,7 @@
 ; CHECK-NEXT: LV: Found FP op with unsafe algebra.
 ; CHECK-NEXT: LV: Did not find one integer induction var.
 ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT: LV: Target has picked ScalarEpilogue Predication
 ; CHECK-NEXT: LV: Found trip count: 0
 ; CHECK-NEXT: LV: Scalable vectorization is available
 ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.