diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2573,6 +2573,12 @@
   Value *createIsFPClass(Value *FPNum, unsigned Test);
 
+  /// Return an all-true boolean vector whose element count and scalability
+  /// match \p NumElts.
+  Value *getTrueVector(ElementCount NumElts) {
+    VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts);
+    return Constant::getAllOnesValue(VTy);
+  }
+
 private:
   /// Helper function that creates an assume intrinsic call that
   /// represents an alignment assumption on the provided pointer \p PtrValue
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -72,6 +72,14 @@
                                 const APInt &Imm, Type *Ty,
                                 TTI::TargetCostKind CostKind);
 
+  /// \name Vector Predication Information
+  /// Whether the target supports the %evl parameter of VP intrinsics
+  /// efficiently in hardware, for the given opcode and type/alignment (see the
+  /// LLVM Language Reference - "Vector Predication Intrinsics",
+  /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics).
+  bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+                             Align Alignment) const;
+
   TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
 
   bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -170,6 +170,10 @@
   return TTI::TCC_Free;
 }
 
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+  return ST->hasVInstructions();
+}
+
 TargetTransformInfo::PopcntSupportKind
 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
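For other backends, opting into the new code path only requires implementing the same TTI hook. The sketch below mirrors the RISC-V implementation above; `MyTTIImpl` and `hasVectorUnit()` are hypothetical names, not part of this patch:

```cpp
// Hypothetical target hook: claim efficient %evl support whenever the
// subtarget has a vector unit. Opcode, DataType and Alignment allow a
// finer-grained answer; as the cost-model change below shows, the vectorizer
// currently probes with the placeholders (0, nullptr, Align()).
bool MyTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const {
  return ST->hasVectorUnit();
}
```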
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -121,6 +121,7 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -250,6 +251,44 @@
         "data-and-control-without-rt-check",
         "Similar to data-and-control, but remove the runtime check")));
 
+namespace {
+/// Option prefer-predicate-with-vp-intrinsics indicates that the loop
+/// vectorizer should try to generate VP intrinsics if tail-folding is enabled
+/// (note that this option depends on the prefer-predicate-over-epilogue
+/// option being set to predicate-dont-vectorize).
+/// This can be particularly useful for targets like RISC-V and SX-Aurora that
+/// support vector length predication.
+/// Currently this switch takes three possible values:
+/// 0. no-predication: Do not generate VP intrinsics.
+/// 1. if-explicit-vector-length-support: Only generate VP intrinsics if the
+///    target supports explicit vector length based predication.
+/// 2. force-explicit-vector-length-support: Force the loop vectorizer to
+///    assume that the target supports vector length predication.
+enum class EVLOption {
+  NoPredication = 0,
+  IfEVLSupported,
+  ForceEVLSupport
+};
+} // namespace
+
+static cl::opt<EVLOption> PreferPredicateWithVPEVLIntrinsics(
+    "prefer-predicate-with-vp-intrinsics", cl::init(EVLOption::NoPredication),
+    cl::Hidden,
+    cl::desc("Controls emission of vector predication intrinsics with explicit "
+             "vector length if tail-folding is forced."),
+    cl::values(
+        clEnumValN(EVLOption::NoPredication, "no-predication",
+                   "Do not generate VP intrinsics."),
+        clEnumValN(EVLOption::IfEVLSupported,
+                   "if-explicit-vector-length-support",
+                   "Only generate VP intrinsics if the target supports vector "
+                   "length predication."),
+        clEnumValN(EVLOption::ForceEVLSupport,
+                   "force-explicit-vector-length-support",
+                   "Assume that the target supports vector length predication "
+                   "and generate VP intrinsics accordingly.")));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1094,7 +1133,8 @@
         isa<VPInterleaveRecipe>(CurRec) ||
         isa<VPScalarIVStepsRecipe>(CurRec) ||
         isa<VPCanonicalIVPHIRecipe>(CurRec) ||
-        isa<VPActiveLaneMaskPHIRecipe>(CurRec))
+        isa<VPActiveLaneMaskPHIRecipe>(CurRec) ||
+        isa<VPEVLBasedIVPHIRecipe>(CurRec))
       continue;
 
     // This recipe contributes to the address computation of a widen
@@ -1641,6 +1681,21 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if VP intrinsics with explicit vector length support should
+  /// be generated in the tail folded loop.
+  bool useVPWithVPEVLVectorization() const {
+    // TODO: implement support for max safe dependency distance.
+    return PreferVPWithVPEVLIntrinsics && !EnableVPlanNativePath &&
+           foldTailByMasking() && Legal->isSafeForAnyVectorWidth() &&
+           // FIXME: remove this once vp_reverse is supported.
+           none_of(
+               WideningDecisions,
+               [](const std::pair<std::pair<Instruction *, ElementCount>,
+                                  std::pair<InstWidening, InstructionCost>>
+                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+  }
+
   /// Returns true if the Phi is part of an inloop reduction.
   bool isInLoopReduction(PHINode *Phi) const {
     return InLoopReductions.contains(Phi);
@@ -1786,6 +1841,10 @@
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool CanFoldTailByMasking = false;
 
+  /// Control whether to generate VP intrinsics with explicit-vector-length
+  /// support in vectorized code.
+  bool PreferVPWithVPEVLIntrinsics = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -4995,6 +5054,42 @@
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
     CanFoldTailByMasking = true;
+    if (PreferPredicateWithVPEVLIntrinsics == EVLOption::NoPredication)
+      return MaxFactors;
+
+    if (UserIC > 1) {
+      LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                           "not generate VP intrinsics since the specified "
+                           "interleave count is greater than 1.\n");
+      return MaxFactors;
+    }
+
+    if (MaxFactors.ScalableVF.isScalable() &&
+        MaxFactors.ScalableVF.isNonZero()) {
+      if (PreferPredicateWithVPEVLIntrinsics == EVLOption::IfEVLSupported) {
+        // FIXME: use actual opcode/data type for analysis here.
+        PreferVPWithVPEVLIntrinsics =
+            TTI.hasActiveVectorLength(0, nullptr, Align());
+        if (PreferVPWithVPEVLIntrinsics)
+          LLVM_DEBUG(dbgs()
+                     << "LV: Preference for VP intrinsics indicated. Will "
+                        "try to generate VP intrinsics since the target "
+                        "supports vector length predication.\n");
+        else
+          LLVM_DEBUG(dbgs()
+                     << "LV: Preference for VP intrinsics indicated. Will "
+                        "not try to generate VP intrinsics since the target "
+                        "does not support vector length predication.\n");
+      } else {
+        PreferVPWithVPEVLIntrinsics = true;
+        LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                             "try to generate VP intrinsics.\n");
+      }
+
+      if (PreferVPWithVPEVLIntrinsics)
+        MaxFactors.FixedVF = ElementCount::getFixed(1);
+    }
+
     return MaxFactors;
   }
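Note how the EVL strategy trades the per-lane header mask for an all-true mask plus an explicit length; the `IRBuilder::getTrueVector` helper introduced at the top of this patch builds exactly such masks. A minimal usage sketch (the wrapper function is hypothetical):

```cpp
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Build the scalable all-true mask (<vscale x 4 x i1> here) that VP memory
// intrinsics receive when the EVL operand alone carries the predication.
static Value *buildAllTrueMask(IRBuilder<> &Builder) {
  return Builder.getTrueVector(ElementCount::getScalable(4));
}
```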
@@ -5603,6 +5698,10 @@
   if (!isScalarEpilogueAllowed())
     return 1;
 
+  // Do not interleave if EVL is preferred and no user interleave count is
+  // specified.
+  if (useVPWithVPEVLVectorization())
+    return 1;
+
   // We used the distance for the interleave count.
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
@@ -8667,7 +8766,10 @@
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
       // Now optimize the initial VPlan.
-      VPlanTransforms::optimize(*Plan, *PSE.getSE());
+      VPlanTransforms::optimize(*Plan, *PSE.getSE(),
+                                CM.useVPWithVPEVLVectorization());
+      if (CM.useVPWithVPEVLVectorization())
+        VPlanTransforms::addExplicitVectorLength(*Plan);
       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }
@@ -8961,7 +9063,7 @@
   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
     return nullptr;
 
-  if (useActiveLaneMask(Style)) {
+  if (!CM.useVPWithVPEVLVectorization() && useActiveLaneMask(Style)) {
     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
     // TailFoldingStyle is visible there.
     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
@@ -9493,6 +9595,51 @@
   State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }
 
+/// Creates either vp_store or vp_scatter intrinsic calls to represent a
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+                                Value *StoredVal, bool IsScatter, Value *Mask,
+                                Value *EVLPart, const Align &Alignment) {
+  CallInst *Call;
+  if (IsScatter) {
+    Call = Builder.CreateIntrinsic(Type::getVoidTy(EVLPart->getContext()),
+                                   Intrinsic::vp_scatter,
+                                   {StoredVal, Addr, Mask, EVLPart});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVLPart).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVLPart->getContext()),
+        {StoredVal, Addr}));
+  }
+  Call->addParamAttr(
+      1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsic calls to represent a
+/// predicated load/gather.
+static Instruction *
+lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, VectorType *DataTy,
+                               Value *Addr, bool IsGather, Value *Mask,
+                               Value *EVLPart, const Align &Alignment) {
+  CallInst *Call;
+  if (IsGather) {
+    Call = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
+                                   {Addr, Mask, EVLPart}, nullptr,
+                                   "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVLPart).setMask(Mask);
+    Call = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  Call->addParamAttr(
+      0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+  return Call;
+}
+
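To make the `VectorBuilder` pattern above concrete in isolation, here is a hedged, self-contained sketch of emitting one `vp.load`; all parameters are assumed to be valid values supplied by the caller:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/VectorBuilder.h"
using namespace llvm;

// Emit a vp.load of DataTy from Addr with an explicit Mask and EVL.
// VectorBuilder maps the ordinary Load opcode onto the matching VP intrinsic
// and appends the mask/EVL operands it was configured with.
static Value *emitVPLoad(IRBuilderBase &Builder, VectorType *DataTy,
                         Value *Addr, Value *Mask, Value *EVL) {
  VectorBuilder VBuilder(Builder);
  VBuilder.setEVL(EVL).setMask(Mask);
  return VBuilder.createVectorInstruction(Instruction::Load, DataTy, Addr,
                                          "vp.op.load");
}
```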
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9558,6 +9705,12 @@
     return PartPtr;
   };
 
+  auto MaskValue = [&](unsigned Part) -> Value * {
+    if (isMaskRequired)
+      return BlockInMaskParts[Part];
+    return nullptr;
+  };
+
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());
@@ -9565,7 +9718,21 @@
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Instruction *NewSI = nullptr;
       Value *StoredVal = State.get(StoredValue, Part);
-      if (CreateGatherScatter) {
+      if (Value *EVLPart = State.EVL ? State.get(State.EVL, Part) : nullptr) {
+        // If EVL is not nullptr, then EVL must be a valid value set during
+        // plan creation, possibly a default value equal to the whole vector
+        // register length. EVL is created only if TTI prefers predicated
+        // vectorization, thus if EVL is not nullptr it also implies a
+        // preference for predicated vectorization.
+        // FIXME: Support reverse stores properly once vp_reverse is added.
+        NewSI = lowerStoreUsingVectorIntrinsics(
+            Builder,
+            CreateGatherScatter
+                ? State.get(getAddr(), Part)
+                : CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))),
+            StoredVal, CreateGatherScatter, MaskValue(Part), EVLPart,
+            Alignment);
+      } else if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9596,7 +9763,20 @@
     State.setDebugLocFrom(LI->getDebugLoc());
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Value *NewLI;
-      if (CreateGatherScatter) {
+      if (Value *EVLPart = State.EVL ? State.get(State.EVL, Part) : nullptr) {
+        // If EVL is not nullptr, then EVL must be a valid value set during
+        // plan creation, possibly a default value equal to the whole vector
+        // register length. EVL is created only if TTI prefers predicated
+        // vectorization, thus if EVL is not nullptr it also implies a
+        // preference for predicated vectorization.
+        // FIXME: Support reverse loads properly once vp_reverse is added.
+        NewLI = lowerLoadUsingVectorIntrinsics(
+            Builder, DataTy,
+            CreateGatherScatter
+                ? State.get(getAddr(), Part)
+                : CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))),
+            CreateGatherScatter, MaskValue(Part), EVLPart, Alignment);
+      } else if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
         NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment,
                                            MaskPart,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,6 +242,12 @@
   ElementCount VF;
   unsigned UF;
 
+  /// If EVL is not nullptr, then EVL must be a valid value set during plan
+  /// creation, possibly a default value equal to the whole vector register
+  /// length. EVL is created only if TTI prefers predicated vectorization,
+  /// thus if EVL is not nullptr it also implies a preference for predicated
+  /// vectorization.
+  VPValue *EVL = nullptr;
+
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
   /// instructions.
@@ -1032,6 +1038,8 @@
     SLPLoad,
     SLPStore,
     ActiveLaneMask,
+    ExplicitVectorLength,
+    ExplicitVectorLengthIVIncrement,
     CalculateTripCountMinusVF,
     CanonicalIVIncrement,
     // The next op is similar to the above, but instead increment the
@@ -1142,6 +1150,8 @@
     default:
       return false;
     case VPInstruction::ActiveLaneMask:
+    case VPInstruction::ExplicitVectorLength:
+    case VPInstruction::ExplicitVectorLengthIVIncrement:
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrement:
    case VPInstruction::CanonicalIVIncrementForPart:
@@ -2132,6 +2142,39 @@
 #endif
 };
 
+/// A recipe for generating the phi node for the current index of elements,
+/// adjusted in accordance with the EVL value. It starts at the StartIV value
+/// and gets incremented by EVL in each iteration of the vector loop.
+class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
+public:
+  VPEVLBasedIVPHIRecipe(VPValue *StartMask, DebugLoc DL)
+      : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartMask, DL) {}
+
+  ~VPEVLBasedIVPHIRecipe() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
+
+  static inline bool classof(const VPHeaderPHIRecipe *D) {
+    return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
+  }
+
+  /// Generate the phi node that carries the EVL-based IV correctly across
+  /// iterations.
+  void execute(VPTransformState &State) override;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A Recipe for widening the canonical induction variable of the vector loop.
 class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
 public:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -335,6 +335,44 @@
     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
     return Builder.CreateSelect(Cmp, Sub, Zero);
   }
+  case VPInstruction::ExplicitVectorLength: {
+    // Set EVL.
+    auto GetSetVL = [=](VPTransformState &State, Value *EVL) {
+      assert(EVL->getType()->isIntegerTy() &&
+             "Requested vector length should be an integer.");
+
+      // TODO: Add support for MaxSafeDist for correct loop emission.
+      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+
+      Value *GVL = State.Builder.CreateIntrinsic(
+          State.Builder.getInt32Ty(),
+          Intrinsic::experimental_get_vector_length,
+          {EVL, VFArg, State.Builder.getInt1(State.VF.isScalable())});
+      return GVL;
+    };
+    // TODO: Restructure this code with an explicit remainder loop; vsetvli
+    // can be outside of the main loop.
+    assert(State.UF == 1 &&
+           "No unrolling expected for predicated vectorization.");
+    // Compute VTC - IV as the EVL (requested vector length).
+    Value *IV = State.get(getOperand(0), 0);
+    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
+    Value *EVL = State.Builder.CreateSub(TripCount, IV);
+    Value *SetVL = GetSetVL(State, EVL);
+    State.EVL = this;
+    return SetVL;
+  }
+  case VPInstruction::ExplicitVectorLengthIVIncrement: {
+    assert(State.UF == 1 && Part == 0 &&
+           "Expected unroll factor 1 for VP vectorization.");
+    Value *Phi = State.get(getOperand(0), 0);
+    Value *EVL = State.get(getOperand(1), 0);
+    assert(EVL->getType()->getScalarSizeInBits() <=
+               Phi->getType()->getScalarSizeInBits() &&
+           "EVL type must not be wider than the Phi type.");
+    EVL = Builder.CreateIntCast(EVL, Phi->getType(), /*isSigned=*/false);
+    return Builder.CreateAdd(Phi, EVL, Name, hasNoUnsignedWrap(),
+                             hasNoSignedWrap());
+  }
   case VPInstruction::CanonicalIVIncrement: {
     if (Part == 0) {
       auto *Phi = State.get(getOperand(0), 0);
@@ -465,6 +503,12 @@
   case VPInstruction::ActiveLaneMask:
     O << "active lane mask";
     break;
+  case VPInstruction::ExplicitVectorLength:
+    O << "EXPLICIT-VECTOR-LENGTH";
+    break;
+  case VPInstruction::ExplicitVectorLengthIVIncrement:
+    O << "EXPLICIT-VECTOR-LENGTH +";
+    break;
   case VPInstruction::FirstOrderRecurrenceSplice:
     O << "first-order splice";
     break;
@@ -1690,3 +1734,25 @@
   printOperands(O, SlotTracker);
 }
 #endif
+
+void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
+  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
+  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
+  PHINode *EntryPart =
+      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
+  EntryPart->addIncoming(Start, VectorPH);
+  EntryPart->setDebugLoc(getDebugLoc());
+  State.set(this, EntryPart, 0);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
+
+  printAsOperand(O, SlotTracker);
+  O << " = phi ";
+  printOperands(O, SlotTracker);
+}
+#endif
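The codegen above leans on the semantics of `llvm.experimental.get.vector.length`. As a rough scalar model, the sketch below shows one legal result; the actual contract only bounds the result (it must not exceed the remaining count or the register capacity, and must be nonzero while elements remain), so targets such as RISC-V may legally return less:

```cpp
#include <algorithm>
#include <cstdint>

// One legal implementation of get.vector.length(%avl, VF, scalable): clamp
// the remaining element count (AVL) to the runtime capacity VF * vscale.
uint32_t getVectorLengthModel(uint64_t AVL, uint32_t VF, uint32_t VScale) {
  return static_cast<uint32_t>(
      std::min<uint64_t>(AVL, static_cast<uint64_t>(VF) * VScale));
}
```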
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -59,7 +59,8 @@
   /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
   /// optimizations, dead recipe removal, replicate region optimizations and
   /// block merging.
-  static void optimize(VPlan &Plan, ScalarEvolution &SE);
+  static void optimize(VPlan &Plan, ScalarEvolution &SE,
+                       bool KeepVPCanonicalWidenRecipes);
 
   /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
   /// region block and remove the mask operand. Optimize the created regions by
@@ -79,6 +80,13 @@
       bool UseActiveLaneMaskForControlFlow,
       bool DataAndControlFlowWithoutRuntimeCheck);
 
+  /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+  /// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
+  /// increment, with a VPEVLBasedIVPHIRecipe.
+  /// VPCanonicalIVPHIRecipe is only used to control the loop after
+  /// this transformation.
+  static void addExplicitVectorLength(VPlan &Plan);
+
 private:
   /// Remove redundant VPBasicBlocks by merging them into their predecessor if
   /// the predecessor has a single successor.
@@ -94,7 +102,8 @@
   /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
   /// recipe, if it exists.
-  static void removeRedundantCanonicalIVs(VPlan &Plan);
+  static void removeRedundantCanonicalIVs(VPlan &Plan,
+                                          bool KeepVPCanonicalWidenRecipes);
 
   static void removeDeadRecipes(VPlan &Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -439,7 +439,10 @@
   }
 }
 
-void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
+void VPlanTransforms::removeRedundantCanonicalIVs(
+    VPlan &Plan, bool KeepVPCanonicalWidenRecipes) {
+  if (KeepVPCanonicalWidenRecipes)
+    return;
   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
   VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
   for (VPUser *U : CanonicalIV->users()) {
@@ -869,8 +872,9 @@
   }
 }
 
-void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
-  removeRedundantCanonicalIVs(Plan);
+void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE,
+                               bool KeepVPCanonicalWidenRecipes) {
+  removeRedundantCanonicalIVs(Plan, KeepVPCanonicalWidenRecipes);
   removeRedundantInductionCasts(Plan);
 
   optimizeInductions(Plan, SE);
@@ -987,6 +991,59 @@
   return LaneMaskPhi;
 }
 
+/// Replaces the (ICMP_ULE, WideCanonicalIV, backedge-taken-count) pattern with
+/// the given idiom \p Idiom.
+static void replaceHeaderPredicateWithIdiom(VPlan &Plan, VPValue &Idiom,
+                                            bool OnlyWidenMemRecipes = false) {
+  auto *FoundWidenCanonicalIVUser =
+      find_if(Plan.getCanonicalIV()->users(),
+              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+  assert(FoundWidenCanonicalIVUser &&
+         "Must have widened canonical IV when tail folding!");
+  auto *WideCanonicalIV =
+      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+  // Walk users of WideCanonicalIV and replace all compares of the form
+  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with
+  // the given idiom VPValue.
+  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+    if (!CompareToReplace ||
+        CompareToReplace->getOpcode() != Instruction::ICmp ||
+        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+        CompareToReplace->getOperand(1) != BTC)
+      continue;
+
+    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+           "WidenCanonicalIV must be the first operand of the compare");
+    if (OnlyWidenMemRecipes) {
+      for (unsigned J = 0; J < CompareToReplace->getNumUsers();) {
+        VPUser *User = CompareToReplace->user_begin()[J];
+        unsigned NumUsers = CompareToReplace->getNumUsers();
+        if (!isa<VPWidenMemoryInstructionRecipe>(User)) {
+          ++J;
+          continue;
+        }
+        for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+          if (User->getOperand(I) == CompareToReplace)
+            User->setOperand(I, &Idiom);
+        // If a user got removed after updating the current user, the next
+        // user to update will be moved to the current position, so we only
+        // need to increment the index if the number of users did not change.
+        if (NumUsers == CompareToReplace->getNumUsers())
+          J++;
+      }
+      if (CompareToReplace->getNumUsers() == 0)
+        CompareToReplace->eraseFromParent();
+    } else {
+      CompareToReplace->replaceAllUsesWith(&Idiom);
+      CompareToReplace->eraseFromParent();
+    }
+  }
+  if (!WideCanonicalIV->getNumUsers())
+    WideCanonicalIV->eraseFromParent();
+}
+
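The index-based user walk above is easy to misread, so here is a stripped-down sketch of the invariant it relies on. It assumes access to the VPlan headers (which live under lib/Transforms/Vectorize, so this is illustrative rather than a public API), and `rewriteSelectedUsers` is a hypothetical helper:

```cpp
#include "llvm/ADT/STLFunctionalExtras.h"
#include "VPlanValue.h"
using namespace llvm;

// Replace Def with Replacement in the operands of users accepted by
// ShouldRewrite, while iterating the use list that the rewrites themselves
// shrink: setOperand may drop User from Def's use list, sliding the next user
// into slot J, so J only advances when the user count is unchanged.
static void rewriteSelectedUsers(VPValue *Def, VPValue *Replacement,
                                 function_ref<bool(VPUser *)> ShouldRewrite) {
  for (unsigned J = 0; J < Def->getNumUsers();) {
    VPUser *User = Def->user_begin()[J];
    unsigned NumUsers = Def->getNumUsers();
    if (ShouldRewrite(User))
      for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
        if (User->getOperand(I) == Def)
          User->setOperand(I, Replacement);
    if (NumUsers == Def->getNumUsers())
      ++J;
  }
}
```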
 void VPlanTransforms::addActiveLaneMask(
     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
     bool DataAndControlFlowWithoutRuntimeCheck) {
@@ -1016,18 +1073,63 @@
   // Walk users of WideCanonicalIV and replace all compares of the form
   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
   // active-lane-mask.
-  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
-    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
-    if (!CompareToReplace ||
-        CompareToReplace->getOpcode() != Instruction::ICmp ||
-        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
-        CompareToReplace->getOperand(1) != BTC)
-      continue;
-
-    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
-           "WidenCanonicalIV must be the first operand of the compare");
-    CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
-    CompareToReplace->eraseFromParent();
-  }
+  replaceHeaderPredicateWithIdiom(Plan, *LaneMask->getVPSingleValue());
+}
 
+// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and replace all
+// uses of VPCanonicalIVPHIRecipe, except the canonical IV increment, with a
+// VPEVLBasedIVPHIRecipe. After this transformation, VPCanonicalIVPHIRecipe is
+// used only for counting loop iterations.
+//
+// The function uses the following definitions:
+//  %StartV is the canonical induction start value.
+//
+// The function adds the following recipes:
+//
+// vector.ph:
+// ...
+//
+// vector.body:
+// ...
+// %P = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
+//                                          [ %NextEVL, %vector.body ]
+// %EVL = EXPLICIT-VECTOR-LENGTH %P, original TC
+// ...
+// %NextEVL = EXPLICIT-VECTOR-LENGTH + %P, %EVL
+// ...
+//
+void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
+  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  auto *CanonicalIVPHI = Plan.getCanonicalIV();
+  VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+  // Walk users of WideCanonicalIV and replace all compares of the form
+  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an all-true mask.
+  Value *TrueMask =
+      ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext());
+  VPValue *VPTrueMask = Plan.getVPValueOrAddLiveIn(TrueMask);
+  replaceHeaderPredicateWithIdiom(Plan, *VPTrueMask,
+                                  /*OnlyWidenMemRecipes=*/true);
+  // Now create the ExplicitVectorLengthPhi recipe in the main loop.
+  auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+  EVLPhi->insertBefore(*Header, Header->getFirstNonPhi());
+  auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
+                                  {EVLPhi, Plan.getTripCount()});
+  VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
+
+  auto *CanonicalIVIncrement =
+      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+  auto *NextEVLIV = new VPInstruction(
+      VPInstruction::ExplicitVectorLengthIVIncrement, {EVLPhi, VPEVL},
+      {CanonicalIVIncrement->hasNoUnsignedWrap(),
+       CanonicalIVIncrement->hasNoSignedWrap()},
+      CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
+  NextEVLIV->insertBefore(CanonicalIVIncrement);
+  EVLPhi->addOperand(NextEVLIV);
+
+  // Replace all uses of VPCanonicalIVPHIRecipe by VPEVLBasedIVPHIRecipe,
+  // except for the canonical IV increment.
+  CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
+  CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+}
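As a mental model of the loop this transform produces (and which is visible in the test below), the two inductions advance like the scalar sketch here; `VFxVScale` stands for the runtime register capacity, and `std::min` is one legal `get.vector.length` result:

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of the transformed loop control (illustration only, TC > 0):
// the canonical IV still steps by the full VF*vscale and drives the exit
// compare against the rounded-up vector trip count VecTC, while the EVL-based
// IV advances by whatever get.vector.length returned and indexes the memory
// operations.
void evlLoopModel(uint64_t TC, uint64_t VecTC, uint64_t VFxVScale) {
  uint64_t Index = 0, EVLBasedIV = 0;
  do {
    uint64_t AVL = TC - EVLBasedIV;          // remaining elements
    uint64_t EVL = std::min(AVL, VFxVScale); // one legal get.vector.length
    // ... VP loads/stores process EVL lanes starting at EVLBasedIV ...
    EVLBasedIV += EVL;      // ExplicitVectorLengthIVIncrement
    Index += VFxVScale;     // CanonicalIVIncrement
  } while (Index != VecTC); // loop control still uses the canonical IV
}
```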
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -357,6 +357,7 @@
   // VPHeaderPHIRecipe need to be kept together.
   VPCanonicalIVPHISC,
   VPActiveLaneMaskPHISC,
+  VPEVLBasedIVPHISC,
   VPFirstOrderRecurrencePHISC,
   VPWidenPHISC,
   VPWidenIntOrFpInductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -202,7 +202,58 @@
   for (const VPRecipeBase &R : *VPBB)
     RecipeNumbering[&R] = Cnt++;
 
+  // Check that EVL recipes appear only in the header/exiting block of the
+  // vector loop region, and at most once each.
+  DenseSet<unsigned> EVLFound;
+  const VPBlockBase *Header = nullptr;
+  const VPBlockBase *Exit = nullptr;
+  const VPlan *Plan = VPBB->getPlan();
+  if (Plan && Plan->getEntry()->getNumSuccessors() == 1) {
+    Header = Plan->getVectorLoopRegion()->getEntry();
+    Exit = Plan->getVectorLoopRegion()->getExiting();
+  }
+  auto CheckEVLRecipes = [&](const VPRecipeBase *R) {
+    if (isa<VPEVLBasedIVPHIRecipe>(R)) {
+      if (!Header || VPBB != Header) {
+        errs() << "EVL PHI recipe not in entry block!\n";
+        return false;
+      }
+      if (EVLFound.contains(VPDef::VPEVLBasedIVPHISC)) {
+        errs() << "EVL PHI recipe inserted more than once!\n";
+        return false;
+      }
+      EVLFound.insert(VPDef::VPEVLBasedIVPHISC);
+      return true;
+    }
+    auto *RInst = dyn_cast<VPInstruction>(R);
+    if (!RInst)
+      return true;
+    switch (RInst->getOpcode()) {
+    case VPInstruction::ExplicitVectorLength:
+      if (!Header || VPBB != Header) {
+        errs() << "EVL instruction not in entry block!\n";
+        return false;
+      }
+      break;
+    case VPInstruction::ExplicitVectorLengthIVIncrement:
+      if (!Exit || VPBB != Exit) {
+        errs() << "EVL inc instruction not in exit block!\n";
+        return false;
+      }
+      break;
+    default:
+      return true;
+    }
+    // Offset VPInstruction opcodes by VPLastPHISC so they cannot collide with
+    // the recipe IDs stored in the same set.
+    if (EVLFound.contains(RInst->getOpcode() + VPDef::VPLastPHISC)) {
+      errs() << "EVL instruction inserted more than once!\n";
+      return false;
+    }
+    EVLFound.insert(RInst->getOpcode() + VPDef::VPLastPHISC);
+    return true;
+  };
+
   for (const VPRecipeBase &R : *VPBB) {
+    if (!CheckEVLRecipes(&R))
+      return false;
     for (const VPValue *V : R.definedValues()) {
       for (const VPUser *U : V->users()) {
         auto *UI = dyn_cast<VPRecipeBase>(U);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -0,0 +1,1300 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN:   -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN:   -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN:   -prefer-predicate-with-vp-intrinsics=no-predication \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | 
FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP19]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; FORCE-EVL-LABEL: @foo(
+; FORCE-EVL-NEXT:  entry:
+; FORCE-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; FORCE-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; FORCE-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; FORCE-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; FORCE-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FORCE-EVL:       vector.ph:
+; FORCE-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; FORCE-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; FORCE-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; FORCE-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; FORCE-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; FORCE-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; FORCE-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; FORCE-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; FORCE-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FORCE-EVL:       vector.body:
+; FORCE-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FORCE-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FORCE-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; FORCE-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; FORCE-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; FORCE-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]]
+; FORCE-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; FORCE-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; FORCE-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP11]]
+; FORCE-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; FORCE-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; FORCE-EVL-NEXT:    [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; FORCE-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; FORCE-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; FORCE-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; FORCE-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP10]] to i64
+; FORCE-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP19]]
+; FORCE-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; FORCE-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; FORCE-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
+; FORCE-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FORCE-EVL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FORCE-EVL:       middle.block:
+; FORCE-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; FORCE-EVL:       scalar.ph:
+; FORCE-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FORCE-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; FORCE-EVL:       for.body:
+; FORCE-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; FORCE-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; FORCE-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; FORCE-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; FORCE-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; FORCE-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; FORCE-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; FORCE-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; FORCE-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; FORCE-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; FORCE-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; FORCE-EVL:       for.cond.cleanup:
+; FORCE-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; NO-VP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; NO-VP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; NO-VP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]])
+; NO-VP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP9]]
+; NO-VP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; NO-VP-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP9]]
+; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT:    [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; NO-VP-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
+; NO-VP-NEXT:    [[TMP16:%.*]] = 
getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; NO-VP-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP14]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; NO-VP-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; NO-VP-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], [[TMP20]]
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
+; IF-EVL-LABEL: @iv32(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i32 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i32 [[N]], 
[[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i32 [[EVL_BASED_IV]], [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; IF-EVL-NEXT:    store i32 [[TMP19]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL:       for.cond.cleanup:
+; IF-EVL-NEXT:    ret void
+;
+; FORCE-EVL-LABEL: @iv32(
+; FORCE-EVL-NEXT:  entry:
+; FORCE-EVL-NEXT:    [[TMP0:%.*]] = sub i32 -1, [[N:%.*]]
+; FORCE-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; FORCE-EVL-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; FORCE-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]]
+; FORCE-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FORCE-EVL:       vector.ph:
+; FORCE-EVL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; FORCE-EVL-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; FORCE-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; FORCE-EVL-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; FORCE-EVL-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; FORCE-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; FORCE-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; FORCE-EVL-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; FORCE-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FORCE-EVL:       vector.body:
+; FORCE-EVL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FORCE-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FORCE-EVL-NEXT:    
[[TMP9:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; FORCE-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP9]], i32 4, i1 true)
+; FORCE-EVL-NEXT:    [[TMP11:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; FORCE-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP11]]
+; FORCE-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; FORCE-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; FORCE-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP11]]
+; FORCE-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; FORCE-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; FORCE-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i32 [[EVL_BASED_IV]], [[TMP10]]
+; FORCE-EVL-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; FORCE-EVL-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; FORCE-EVL-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
+; FORCE-EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; FORCE-EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FORCE-EVL:       middle.block:
+; FORCE-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; FORCE-EVL:       scalar.ph:
+; FORCE-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FORCE-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; FORCE-EVL:       for.body:
+; FORCE-EVL-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; FORCE-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; FORCE-EVL-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; FORCE-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; FORCE-EVL-NEXT:    store i32 [[TMP19]], ptr [[ARRAYIDX4]], align 4
+; FORCE-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; FORCE-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; FORCE-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; FORCE-EVL:       for.cond.cleanup:
+; FORCE-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @iv32(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    [[TMP0:%.*]] = sub i32 -1, [[N:%.*]]
+; NO-VP-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]]
+; NO-VP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; NO-VP-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; NO-VP-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], 0
+; NO-VP-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> 
@llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP9]], i32 [[N]])
+; NO-VP-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP9]]
+; NO-VP-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; NO-VP-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP9]]
+; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; NO-VP-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
+; NO-VP-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; NO-VP-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; NO-VP-NEXT:    store i32 [[TMP17]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP:       for.cond.cleanup:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 %iv
+  store i32 %0, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond.not = icmp eq i32 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
+; IF-EVL-LABEL: @masked_loadstore(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP13]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP17:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP19]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP21:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP21]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP19]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP22]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 4
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL:       if.then:
+; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT:    
br label [[FOR_INC]] +; IF-EVL: for.inc: +; IF-EVL-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL: exit: +; IF-EVL-NEXT: ret void +; +; FORCE-EVL-LABEL: @masked_loadstore( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; FORCE-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; FORCE-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; FORCE-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; FORCE-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; FORCE-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; FORCE-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; FORCE-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; FORCE-EVL-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; FORCE-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; FORCE-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT2]], [[TMP13]] +; FORCE-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT]] +; FORCE-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]] +; FORCE-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; FORCE-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; FORCE-EVL-NEXT: [[TMP17:%.*]] = icmp ne [[VP_OP_LOAD]], zeroinitializer +; FORCE-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]] +; FORCE-EVL-NEXT: [[TMP19:%.*]] = select [[TMP14]], [[TMP17]], zeroinitializer +; FORCE-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0 +; FORCE-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[TMP19]], i32 [[TMP10]]) +; FORCE-EVL-NEXT: [[TMP21:%.*]] = add [[VP_OP_LOAD]], [[VP_OP_LOAD3]] +; FORCE-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP21]], ptr align 4 [[TMP20]], [[TMP19]], i32 [[TMP10]]) +; FORCE-EVL-NEXT: 
[[TMP22:%.*]] = zext i32 [[TMP10]] to i64
+; FORCE-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP22]]
+; FORCE-EVL-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; FORCE-EVL-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4
+; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
+; FORCE-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FORCE-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FORCE-EVL: middle.block:
+; FORCE-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FORCE-EVL: scalar.ph:
+; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; FORCE-EVL: for.body:
+; FORCE-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; FORCE-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; FORCE-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; FORCE-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; FORCE-EVL: if.then:
+; FORCE-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; FORCE-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; FORCE-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; FORCE-EVL-NEXT: br label [[FOR_INC]]
+; FORCE-EVL: for.inc:
+; FORCE-EVL-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; FORCE-EVL: exit:
+; FORCE-EVL-NEXT: ret void
+;
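+; The NO-VP run keeps the masked-intrinsic form of this loop: the folded tail
+; is guarded by @llvm.get.active.lane.mask and the accesses stay as
+; @llvm.masked.load/@llvm.masked.store without an %evl operand.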
+; NO-VP-LABEL: @masked_loadstore(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; NO-VP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]])
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP9]]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP9]]
+; NO-VP-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> zeroinitializer
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP14]])
+; NO-VP-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]]
+; NO-VP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP20]], 0
+; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP: if.then:
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], [[TMP21]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: br label [[FOR_INC]]
+; NO-VP: for.inc:
+; NO-VP-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-VP: exit:
+; NO-VP-NEXT: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.011
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp ne i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %i.011
+  %1 = load i32, ptr %arrayidx3, align 4
+  %add = add i32 %0, %1
+  store i32 %add, ptr %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i64 %i.011, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
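+; The gather/scatter checks that follow expect the EVL-predicated runs to use
+; @llvm.vp.gather/@llvm.vp.scatter with the element count produced by
+; @llvm.experimental.get.vector.length, whereas the NO-VP run further below
+; keeps @llvm.masked.gather/@llvm.masked.scatter under an active lane mask.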
+define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
+; IF-EVL-LABEL: @gather_scatter(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[TMP9]], zeroinitializer
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP11]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 1, [[TMP13]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP14]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP15]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP17]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP16]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP18]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP16]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP19]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP16]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP16]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP20]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP22]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP24]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float,
ptr [[OUT]], i64 [[TMP24]] +; IF-EVL-NEXT: store float [[TMP25]], ptr [[ARRAYIDX7]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: ret void +; +; FORCE-EVL-LABEL: @gather_scatter( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; FORCE-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; FORCE-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; FORCE-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; FORCE-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; FORCE-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; FORCE-EVL-NEXT: [[TMP10:%.*]] = add [[TMP9]], zeroinitializer +; FORCE-EVL-NEXT: [[TMP11:%.*]] = mul [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; FORCE-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; FORCE-EVL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; FORCE-EVL-NEXT: [[TMP14:%.*]] = mul i64 1, [[TMP13]] +; FORCE-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP14]], i64 0 +; FORCE-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; FORCE-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP15]], i32 2, i1 true) +; FORCE-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], [[VEC_IND]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP16]]) +; FORCE-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_MASKED_GATHER]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv2f32.nxv2p0( align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP16]]) +; FORCE-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_MASKED_GATHER]] +; FORCE-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER2]], align 4 [[TMP19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP16]]) +; FORCE-EVL-NEXT: [[TMP20:%.*]] = zext i32 
[[TMP16]] to i64 +; FORCE-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP20]] +; FORCE-EVL-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP22]] +; FORCE-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; FORCE-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP24:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8 +; FORCE-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP24]] +; FORCE-EVL-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP24]] +; FORCE-EVL-NEXT: store float [[TMP25]], ptr [[ARRAYIDX7]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; FORCE-EVL: for.end: +; FORCE-EVL-NEXT: ret void +; +; NO-VP-LABEL: @gather_scatter( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; NO-VP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; NO-VP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; NO-VP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; NO-VP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; NO-VP-NEXT: [[TMP10:%.*]] = add [[TMP9]], zeroinitializer +; NO-VP-NEXT: [[TMP11:%.*]] = mul [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; NO-VP-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 1, [[TMP13]] +; NO-VP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP14]], i64 0 +; NO-VP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP15:%.*]] = add i64 [[INDEX1]], 0 +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP15]], i64 [[N]]) +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], [[VEC_IND]] +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP16]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_MASKED_GATHER]] +; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2f32.nxv2p0( [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_MASKED_GATHER]] +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER2]], [[TMP18]], i32 4, [[ACTIVE_LANE_MASK]]) +; NO-VP-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP20]] +; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP22:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8 +; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP22]] +; NO-VP-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 +; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP22]] +; NO-VP-NEXT: store float [[TMP23]], ptr [[ARRAYIDX7]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv + %0 = load i64, ptr %arrayidx3, align 8 + %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %0 + %1 = load float, ptr %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %0 + store float %1, ptr %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) { +; IF-EVL-LABEL: @reverse_load_store( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: 
[[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] +; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]] +; IF-EVL-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = add zeroinitializer, [[TMP6]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP7]] +; IF-EVL-NEXT: [[TMP8:%.*]] = extractelement [[VEC_IV]], i32 0 +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 1024) +; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[TMP5]], -1 +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP9]] +; IF-EVL-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP12]] +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i64 1, [[TMP12]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP14]] +; IF-EVL-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i1( [[ACTIVE_LANE_MASK]]) +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[REVERSE]], poison) +; IF-EVL-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[WIDE_MASKED_LOAD]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP9]] +; IF-EVL-NEXT: [[REVERSE4:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[REVERSE3]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]] +; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP20]] +; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP21]] +; IF-EVL-NEXT: [[REVERSE5:%.*]] = call @llvm.experimental.vector.reverse.nxv4i1( [[ACTIVE_LANE_MASK]]) +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[REVERSE4]], ptr [[TMP23]], i32 4, [[REVERSE5]]) +; IF-EVL-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]] +; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] +; IF-EVL-NEXT: 
[[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1 +; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]] +; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4 +; IF-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]] +; IF-EVL-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4 +; IF-EVL-NEXT: [[INC]] = add i32 [[I]], 1 +; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024 +; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL: loopend: +; IF-EVL-NEXT: ret void +; +; FORCE-EVL-LABEL: @reverse_load_store( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; FORCE-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; FORCE-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] +; FORCE-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]] +; FORCE-EVL-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; FORCE-EVL-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; FORCE-EVL-NEXT: [[TMP7:%.*]] = add zeroinitializer, [[TMP6]] +; FORCE-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP7]] +; FORCE-EVL-NEXT: [[TMP8:%.*]] = extractelement [[VEC_IV]], i32 0 +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 1024) +; FORCE-EVL-NEXT: [[TMP9:%.*]] = add i64 [[TMP5]], -1 +; FORCE-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP9]] +; FORCE-EVL-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; FORCE-EVL-NEXT: [[TMP13:%.*]] = mul i64 0, [[TMP12]] +; FORCE-EVL-NEXT: [[TMP14:%.*]] = sub i64 1, [[TMP12]] +; FORCE-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP13]] +; FORCE-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP14]] +; FORCE-EVL-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i1( [[ACTIVE_LANE_MASK]]) +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[REVERSE]], poison) +; FORCE-EVL-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( 
[[WIDE_MASKED_LOAD]]) +; FORCE-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP9]] +; FORCE-EVL-NEXT: [[REVERSE4:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[REVERSE3]]) +; FORCE-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; FORCE-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]] +; FORCE-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]] +; FORCE-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP20]] +; FORCE-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP21]] +; FORCE-EVL-NEXT: [[REVERSE5:%.*]] = call @llvm.experimental.vector.reverse.nxv4i1( [[ACTIVE_LANE_MASK]]) +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[REVERSE4]], ptr [[TMP23]], i32 4, [[REVERSE5]]) +; FORCE-EVL-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]] +; FORCE-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] +; FORCE-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1 +; FORCE-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]] +; FORCE-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4 +; FORCE-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]] +; FORCE-EVL-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4 +; FORCE-EVL-NEXT: [[INC]] = add i32 [[I]], 1 +; FORCE-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024 +; FORCE-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP11:![0-9]+]] +; FORCE-EVL: loopend: +; FORCE-EVL-NEXT: ret void +; +; NO-VP-LABEL: @reverse_load_store( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], 1024 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; NO-VP-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], +; NO-VP-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[VEC_IV]], i32 0 +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[TMP1]], i64 1024) +; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], -1 +; 
NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP2]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -7 +; NO-VP-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i1> poison, <8 x i32> +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[REVERSE]], <8 x i32> poison) +; NO-VP-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i32> [[WIDE_MASKED_LOAD]], <8 x i32> poison, <8 x i32> +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP2]] +; NO-VP-NEXT: [[REVERSE3:%.*]] = shufflevector <8 x i32> [[REVERSE2]], <8 x i32> poison, <8 x i32> +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 -7 +; NO-VP-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[REVERSE3]], ptr [[TMP8]], i32 4, <8 x i1> [[REVERSE]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1 +; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]] +; NO-VP-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4 +; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]] +; NO-VP-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4 +; NO-VP-NEXT: [[INC]] = add i32 [[I]], 1 +; NO-VP-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024 +; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP11:![0-9]+]] +; NO-VP: loopend: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %add = add i64 %add.phi, -1 + %gepl = getelementptr inbounds i32, ptr %ptr, i64 %add + %tmp = load i32, ptr %gepl, align 4 + %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add + store i32 %tmp, ptr %geps, align 4 + %inc = add i32 %i, 1 + %exitcond = icmp ne i32 %inc, 1024 + br i1 %exitcond, label %for.body, label %loopend + +loopend: + ret void +} +define void @interleave(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; IF-EVL-LABEL: @interleave( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 
[[TMP4]], 8 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; IF-EVL-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[N]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP9]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP19]] +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP9]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; IF-EVL-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP26]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-EVL-NEXT: [[TMP28:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] +; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 0 +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP27]], ptr [[TMP31]], i32 4, [[ACTIVE_LANE_MASK]]) +; IF-EVL-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 +; IF-EVL-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP33]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP28]], ptr [[TMP34]], i32 4, [[ACTIVE_LANE_MASK1]]) +; IF-EVL-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 8 +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP36]] +; 
IF-EVL-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP39]], [[TMP38]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; +; FORCE-EVL-LABEL: @interleave( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; FORCE-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; FORCE-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; FORCE-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; FORCE-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; FORCE-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; FORCE-EVL-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; FORCE-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; FORCE-EVL-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[N]]) +; FORCE-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP9]] +; FORCE-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] +; FORCE-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; FORCE-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP19:%.*]] = mul 
i64 [[TMP18]], 4 +; FORCE-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP19]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; FORCE-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP9]] +; FORCE-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP14]] +; FORCE-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; FORCE-EVL-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; FORCE-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP26]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; FORCE-EVL-NEXT: [[TMP27:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; FORCE-EVL-NEXT: [[TMP28:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; FORCE-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] +; FORCE-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] +; FORCE-EVL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP27]], ptr [[TMP31]], i32 4, [[ACTIVE_LANE_MASK]]) +; FORCE-EVL-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 +; FORCE-EVL-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP33]] +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP28]], ptr [[TMP34]], i32 4, [[ACTIVE_LANE_MASK1]]) +; FORCE-EVL-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 8 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP36]] +; FORCE-EVL-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; FORCE-EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; FORCE-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP39]], [[TMP38]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; +; NO-VP-LABEL: @interleave( +; 
NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; NO-VP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; NO-VP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; NO-VP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; NO-VP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; NO-VP-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; NO-VP-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; NO-VP-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[N]]) +; NO-VP-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP14]], i64 [[N]]) +; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; NO-VP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP19]] +; NO-VP-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; NO-VP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP14]] +; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; NO-VP-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP26]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; NO-VP-NEXT: [[TMP27:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP28:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; NO-VP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] +; NO-VP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP27]], ptr [[TMP31]], i32 4, [[ACTIVE_LANE_MASK]]) +; NO-VP-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP33:%.*]] = mul 
i64 [[TMP32]], 4
+; NO-VP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i64 [[TMP33]]
+; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP28]], ptr [[TMP34]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]])
+; NO-VP-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP36]]
+; NO-VP-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP39]], [[TMP38]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -0,0 +1,154 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=FORCE-EVL,CHECK %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
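+
+; The three RUN lines differ only in -prefer-predicate-with-vp-intrinsics:
+; if-explicit-vector-length-support and force-explicit-vector-length-support
+; are expected to build the EXPLICIT-VECTOR-LENGTH based plans checked under
+; IF-EVL/FORCE-EVL, while no-predication keeps the active-lane-mask plan
+; checked under NO-VP.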
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%N>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, ir<true>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, ir<true>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = EXPLICIT-VECTOR-LENGTH + vp<[[EVL_PHI]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = VF * UF + vp<[[IV]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; FORCE-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; FORCE-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; FORCE-EVL-NEXT: Live-in ir<%N> = original trip-count
+; FORCE-EVL-EMPTY:
+; FORCE-EVL: vector.ph:
+; FORCE-EVL-NEXT: Successor(s): vector loop
+; FORCE-EVL-EMPTY:
+; FORCE-EVL-NEXT: <x1> vector loop: {
+; FORCE-EVL-NEXT: vector.body:
+; FORCE-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; FORCE-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; FORCE-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%N>
+; FORCE-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; FORCE-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; FORCE-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, ir<true>
+; FORCE-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; FORCE-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, ir<true>
+; FORCE-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; FORCE-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; FORCE-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, ir<true>
+; FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = EXPLICIT-VECTOR-LENGTH + vp<[[EVL_PHI]]>, vp<[[EVL]]>
+; FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = VF * UF + vp<[[IV]]>
+; FORCE-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; FORCE-EVL-NEXT: No successors
+; FORCE-EVL-NEXT: }
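+
+; With no-predication the plan below contains no EXPLICIT-VECTOR-LENGTH
+; recipes; the folded tail is guarded by an active-lane-mask EMIT instead.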
vector-trip-count +; NO-VP-NEXT: Live-in ir<%N> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-NEXT: EMIT vp<[[MASK:%.+]]> = active lane mask vp<[[ST]]>, ir<%N> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv + store i32 %add, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +define void @safe_dep(ptr %p) { +; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<512> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]> +; CHECK-NEXT: WIDEN ir<[[V:%.+]]> = load ir<[[GEP1]]> +; CHECK-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100> +; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> +; CHECK-NEXT: WIDEN store ir<[[GEP2]]>, ir<[[V]]> +; CHECK-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + nuw vp<[[IV]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 100 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 511 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ 
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; IF-EVL-LABEL: @foo( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> +; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] 
+; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; +; FORCE-EVL-LABEL: @foo( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15 +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; FORCE-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> +; FORCE-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; FORCE-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; FORCE-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; FORCE-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; FORCE-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; FORCE-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; 
FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; FORCE-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; FORCE-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; NO-VP-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> +; NO-VP-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add 
i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv + store i32 %add, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll @@ -0,0 +1,115 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; 
IF-EVL-NEXT: Live-in ir<%N> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +; FORCE-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; FORCE-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; FORCE-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; FORCE-EVL-NEXT: Live-in ir<%N> = original trip-count +; FORCE-EVL-EMPTY: +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: Successor(s): vector loop +; FORCE-EVL-EMPTY: +; FORCE-EVL-NEXT: vector loop: { +; FORCE-EVL-NEXT: vector.body: +; FORCE-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; FORCE-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; FORCE-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; FORCE-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; FORCE-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; FORCE-EVL-NEXT: No successors +; FORCE-EVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; NO-VP-NEXT: Live-in ir<%N> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; NO-VP-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = 
getelementptr inbounds ir<%c>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv + store i32 %add, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +}
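
Note (illustration only, not part of the patch): the IF-EVL VPlans above record the EXPLICIT-VECTOR-LENGTH recipes at the planning stage; the LangRef's VP intrinsics together with @llvm.experimental.get.vector.length are the natural lowering target for them. Below is a minimal hand-written sketch of the kind of EVL-controlled loop the RISC-V IF-EVL plan for @foo corresponds to, assuming VF=vscale x 4; the function and value names are hypothetical, and the mask is the all-true vector that the new IRBuilder::getTrueVector helper produces, since with EVL the tail is handled by the length operand rather than by a lane mask.

; Sketch only -- hypothetical names, VF=vscale x 4 assumed.
declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)

define void @evl_sketch(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
entry:
  ; All-true mask (what IRBuilder::getTrueVector returns): every lane is
  ; governed solely by the %evl operand of the VP intrinsics.
  %headlane = insertelement <vscale x 4 x i1> poison, i1 true, i64 0
  %alltrue = shufflevector <vscale x 4 x i1> %headlane, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; EXPLICIT-VECTOR-LENGTH step: ask the target how many of the remaining
  ; elements to process this iteration (at most vscale x 4, never more
  ; than %remaining).
  %remaining = sub i64 %N, %iv
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %remaining, i32 4, i1 true)
  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
  %vb = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %gep.b, <vscale x 4 x i1> %alltrue, i32 %evl)
  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
  %vc = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %gep.c, <vscale x 4 x i1> %alltrue, i32 %evl)
  %vadd = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %vc, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %alltrue, i32 %evl)
  %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %vadd, ptr %gep.a, <vscale x 4 x i1> %alltrue, i32 %evl)
  ; EXPLICIT-VECTOR-LENGTH-BASED-IV step: advance the induction by %evl
  ; instead of a fixed VF * UF, so no separate scalar epilogue is needed.
  %evl.zext = zext i32 %evl to i64
  %iv.next = add i64 %iv, %evl.zext
  %done = icmp uge i64 %iv.next, %N
  br i1 %done, label %exit, label %loop

exit:
  ret void
}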