diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2512,6 +2512,12 @@
                                   unsigned Index, unsigned FieldIndex,
                                   MDNode *DbgInfo);
 
+  /// Return an all-true boolean vector of size and scalability \p NumElts.
+  Value *getTrueVector(ElementCount NumElts) {
+    VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts);
+    return Constant::getAllOnesValue(VTy);
+  }
+
 private:
   /// Helper function that creates an assume intrinsic call that
   /// represents an alignment assumption on the provided pointer \p PtrValue
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -75,6 +75,14 @@
                                 const APInt &Imm, Type *Ty,
                                 TTI::TargetCostKind CostKind);
 
+  /// \name Vector Predication Information
+  /// Whether the target supports the %evl parameter of VP intrinsics
+  /// efficiently in hardware, for the given opcode and type/alignment (see
+  /// LLVM Language Reference - "Vector Predication Intrinsics",
+  /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics).
+  bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+                             Align Alignment) const;
+
   TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
 
   bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -185,6 +185,19 @@
   return TTI::TCC_Free;
 }
 
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+  if (!DataTy)
+    return ST->hasVInstructions();
+
+  if (DataTy->isFloatTy())
+    return ST->hasVInstructionsF32();
+  if (DataTy->isDoubleTy())
+    return ST->hasVInstructionsF64();
+
+  unsigned IntWidth = DataTy->getIntegerBitWidth();
+  return IntWidth <= ST->getELEN();
+}
+
 TargetTransformInfo::PopcntSupportKind
 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,44 @@
                    "data-and-control-without-rt-check",
                    "Similar to data-and-control, but remove the runtime check")));
 
+// Option prefer-predicate-with-vp-intrinsics is an experimental switch to
+// indicate that the loop vectorizer should try to generate VP intrinsics if
+// tail-folding is enabled (note that this option is dependent on the
+// prefer-predicate-over-epilogue option being set to
+// predicate-dont-vectorize). This can be particularly useful for targets like
+// RISC-V and SX-Aurora that support vector length predication.
+// Currently this switch takes three possible values:
+// 0. no-predication: Do not generate VP intrinsics.
+// 1. if-explicit-vector-length-support: Only generate VP intrinsics if the
+//    target supports explicit vector length based predication.
+// 2. force-explicit-vector-length-support: This is purely an
+//    experimental/testing option which will be removed in the future. It
+//    forces the loop vectorizer to assume that the target supports vector
+//    length predication.
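As a usage sketch, mirroring the RUN lines of the tests added later in this patch (the input file name here is only a placeholder), the new switch is meant to be combined with predicate-dont-vectorize tail folding:

    opt -passes=loop-vectorize \
        -prefer-predicate-over-epilogue=predicate-dont-vectorize \
        -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \
        -mtriple=riscv64 -mattr=+v -S input.ll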
+namespace {
+enum class EVLOption {
+  NoPredication = 0,
+  IfEVLSupported,
+  ForceEVLSupport
+};
+} // namespace
+
+static cl::opt<EVLOption> PreferPredicateWithVPIntrinsics(
+    "prefer-predicate-with-vp-intrinsics", cl::init(EVLOption::NoPredication),
+    cl::Hidden,
+    cl::desc("When vectorizing with tail-folding, generate vector predication "
+             "intrinsics."),
+    cl::values(
+        clEnumValN(EVLOption::NoPredication, "no-predication",
+                   "Do not generate VP intrinsics."),
+        clEnumValN(EVLOption::IfEVLSupported,
+                   "if-explicit-vector-length-support",
+                   "Only generate VP intrinsics if the target supports vector "
+                   "length predication."),
+        clEnumValN(EVLOption::ForceEVLSupport,
+                   "force-explicit-vector-length-support",
+                   "Assume that the target supports vector length predication "
+                   "and generate VP intrinsics accordingly.")));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1589,6 +1627,11 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if VP intrinsics should be generated in the tail-folded
+  /// loop.
+  bool useVPVectorization() const {
+    return PreferVPIntrinsics && foldTailByMasking();
+  }
+
   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
   /// nodes to the chain of instructions representing the reductions. Uses a
   /// MapVector to ensure deterministic iteration order.
@@ -1748,6 +1791,9 @@
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool CanFoldTailByMasking = false;
 
+  /// Control whether to generate VP intrinsics in vectorized code.
+  bool PreferVPIntrinsics = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -2835,6 +2881,19 @@
   if (VectorTripCount)
     return VectorTripCount;
 
+  if (Cost->useVPVectorization()) {
+    Value *TC = getTripCount();
+    // Loop has multiple exits. Make sure the scalar remainder executes at
+    // least one scalar iteration to perform the correct jump.
+    if (Cost->requiresScalarEpilogue(VF)) {
+      IRBuilder<> Builder(InsertBlock->getTerminator());
+      TC = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+                             "adj.for.rem.tc");
+    }
+    return VectorTripCount = TC;
+  }
+
   Value *TC = getTripCount();
   IRBuilder<> Builder(InsertBlock->getTerminator());
 
@@ -5153,6 +5212,34 @@
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
     CanFoldTailByMasking = true;
+    if (PreferPredicateWithVPIntrinsics == EVLOption::NoPredication)
+      return MaxFactors;
+
+    if (UserIC > 1) {
+      LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                           "not generate VP intrinsics since the interleave "
+                           "count specified is greater than 1.\n");
+      return MaxFactors;
+    }
+
+    if (MaxFactors.ScalableVF.isScalable() &&
+        MaxFactors.ScalableVF.isNonZero()) {
+      if (PreferPredicateWithVPIntrinsics == EVLOption::IfEVLSupported) {
+        // FIXME: use actual opcode/data type for analysis here.
+        PreferVPIntrinsics = TTI.hasActiveVectorLength(0, nullptr, Align());
+        LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                             "try to generate VP intrinsics if the target "
+                             "supports vector length predication.\n");
+      } else {
+        PreferVPIntrinsics = true;
+        LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                             "try to generate VP intrinsics.\n");
+      }
+
+      if (PreferVPIntrinsics)
+        MaxFactors.FixedVF = ElementCount::getFixed(1);
+    }
+
     return MaxFactors;
   }
 
@@ -5725,6 +5812,11 @@
   if (!isScalarEpilogueAllowed())
     return 1;
 
+  // Do not interleave if VP intrinsics are preferred and no user interleave
+  // count is specified.
+  if (useVPVectorization())
+    return 1;
+
   // We used the distance for the interleave count.
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
@@ -8130,7 +8222,7 @@
   VPValue *BlockMask = nullptr;
 
   if (OrigLoop->getHeader() == BB) {
-    if (!CM.blockNeedsPredicationForAnyReason(BB))
+    if (!CM.blockNeedsPredicationForAnyReason(BB) || CM.useVPVectorization())
       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
 
     assert(CM.foldTailByMasking() && "must fold the tail");
@@ -8689,7 +8781,7 @@
 // Add the necessary canonical IV and branch recipes required to control the
 // loop.
 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
-                                  TailFoldingStyle Style) {
+                                  TailFoldingStyle Style, bool NeedEVL) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
 
@@ -8699,17 +8791,27 @@
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
+  VPEVLRecipe *VPEVL = nullptr;
+  if (NeedEVL) {
+    VPEVL = new VPEVLRecipe(Plan.getCanonicalIV(), &Plan.getVectorTripCount());
+    Header->appendRecipe(VPEVL);
+  }
+
   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
   // IV by VF * UF.
+  SmallVector<VPValue *> Args(1, CanonicalIVPHI);
+  if (VPEVL)
+    Args.push_back(VPEVL);
+
   bool HasNUW = Style == TailFoldingStyle::None;
   auto *CanonicalIVIncrement =
       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
                                : VPInstruction::CanonicalIVIncrement,
-                        {CanonicalIVPHI}, DL, "index.next");
+                        Args, DL, "index.next");
   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
 
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
-  if (useActiveLaneMaskForControlFlow(Style)) {
+  if (!NeedEVL && useActiveLaneMaskForControlFlow(Style)) {
     // Create the active lane mask instruction in the vplan preheader.
     VPBasicBlock *VecPreheader =
         cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
@@ -8893,7 +8995,8 @@
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
-                        CM.getTailFoldingStyle(IVUpdateMayOverflow));
+                        CM.getTailFoldingStyle(IVUpdateMayOverflow),
+                        CM.useVPVectorization());
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -8961,7 +9064,11 @@
           HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
         // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the
        // phi section of HeaderVPBB.
-        assert(isa<TruncInst>(Instr));
+        // Do the same if a VPEVLRecipe was emitted for predicated
+        // vectorization support.
+        // TODO: try to avoid moving recipes by inserting them in the proper place.
+        assert(isa<TruncInst>(Instr) ||
+               isa<VPEVLRecipe>(HeaderVPBB->getFirstNonPhi()));
         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
       } else
         VPBB->appendRecipe(Recipe);
@@ -9088,7 +9195,7 @@
   Term->eraseFromParent();
 
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
-                        CM.getTailFoldingStyle());
+                        CM.getTailFoldingStyle(), CM.useVPVectorization());
   return Plan;
 }
 
@@ -9650,6 +9757,18 @@
     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
   };
 
+  auto MaskValue = [&](unsigned Part, ElementCount EC) -> Value * {
+    VPValue *Mask = getMask();
+    assert(Mask && "MaskValue must be called for recipes with a set mask");
+    // The outermost mask can be lowered as an all-ones mask when using
+    // EVL.
+    if (auto *IMask = dyn_cast<VPInstruction>(Mask))
+      if (IMask->getOpcode() == VPInstruction::ICmpULE)
+        return Builder.getTrueVector(EC);
+
+    return BlockInMaskParts[Part];
+  };
+
   // Handle Stores:
   if (SI) {
     State.setDebugLocFromInst(SI);
@@ -9657,6 +9776,11 @@
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Instruction *NewSI = nullptr;
       Value *StoredVal = State.get(StoredValue, Part);
+      // If EVL is not nullptr, then EVL must be a valid value set during plan
+      // creation, possibly a default value of the whole vector register
+      // length. EVL is created only if TTI prefers predicated vectorization,
+      // thus a non-null EVL also implies a preference for predicated
+      // vectorization.
+      Value *EVLPart = State.EVL ? State.get(State.EVL, Part) : nullptr;
      if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
@@ -9672,11 +9796,27 @@
         }
         auto *VecPtr =
             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-        if (isMaskRequired)
+        // If EVLPart is not null, we can vectorize using a predicated
+        // intrinsic.
+        if (EVLPart) {
+          auto *StoredValTy = cast<VectorType>(StoredVal->getType());
+          BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+          Function *VPIntr = Intrinsic::getDeclaration(
+              VectorPH->getModule(), Intrinsic::vp_store,
+              {StoredValTy, VecPtr->getType()});
+          Value *BlockInMaskPart =
+              isMaskRequired
+                  ? MaskValue(Part, StoredValTy->getElementCount())
+                  : Builder.getTrueVector(StoredValTy->getElementCount());
+
+          NewSI = Builder.CreateCall(
+              VPIntr, {StoredVal, VecPtr, BlockInMaskPart, EVLPart});
+        } else if (isMaskRequired) {
           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                             BlockInMaskParts[Part]);
-        else
+        } else {
           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+        }
       }
       State.addMetadata(NewSI, SI);
     }
@@ -9688,6 +9828,13 @@
   State.setDebugLocFromInst(LI);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
+
+    // If EVL is not nullptr, then EVL must be a valid value set during plan
+    // creation, possibly a default value of the whole vector register length.
+    // EVL is created only if TTI prefers predicated vectorization, thus a
+    // non-null EVL also implies a preference for predicated vectorization.
+    Value *EVLPart = State.EVL ? State.get(State.EVL, Part) : nullptr;
+
     if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
@@ -9697,13 +9844,26 @@
     } else {
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
+      if (EVLPart) {
+        BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+        Function *VPIntr =
+            Intrinsic::getDeclaration(VectorPH->getModule(), Intrinsic::vp_load,
+                                      {DataTy, VecPtr->getType()});
+
+        Value *BlockInMaskPart =
+            isMaskRequired ? MaskValue(Part, DataTy->getElementCount())
+                           : Builder.getTrueVector(DataTy->getElementCount());
+
+        NewLI = Builder.CreateCall(VPIntr, {VecPtr, BlockInMaskPart, EVLPart},
+                                   "vp.op.load");
+      } else if (isMaskRequired) {
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
-      else
+      } else {
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment,
                                          "wide.load");
+      }
 
       // Add metadata to the load, but setVectorValue to the reverse shuffle.
       State.addMetadata(NewLI, LI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,6 +242,12 @@
   ElementCount VF;
   unsigned UF;
 
+  // If EVL is not nullptr, then EVL must be a valid value set during plan
+  // creation, possibly a default value of the whole vector register length.
+  // EVL is created only if TTI prefers predicated vectorization, thus a
+  // non-null EVL also implies a preference for predicated vectorization.
+  VPValue *EVL = nullptr;
+
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
   /// instructions.
@@ -2049,6 +2055,43 @@
                    VPValue *Step, Type *Ty) const;
 };
 
+/// A recipe to generate the Explicit Vector Length (EVL) value to be used
+/// with VPred intrinsics.
+/// There are three ways to compute the EVL parameter for the VP intrinsics:
+///
+/// 1. The simplest way is to use the VF as EVL and rely solely on the mask
+///    parameter to control predication. The mask parameter is the same as
+///    computed for the current tail-folding implementation.
+/// 2. The second way is to insert instructions to compute min(VF, trip_count -
+///    index) for each vector iteration.
+/// 3. For architectures like RISC-V, which have a special instruction to
+///    compute/set an explicit vector length, we also introduce an experimental
+///    intrinsic, set_vector_length, that can be lowered to architecture
+///    specific instruction(s) to compute EVL.
+class VPEVLRecipe : public VPRecipeBase, public VPValue {
+public:
+  VPEVLRecipe(VPValue *IV, VPValue *TC)
+      : VPRecipeBase(VPDef::VPEVLSC, {IV, TC}), VPValue(this) {}
+  ~VPEVLRecipe() override = default;
+
+  /// Return the VPValue representing the induction variable.
+  VPValue *getIV() const { return getOperand(0); }
+
+  /// Return the VPValue representing the vector trip count.
+  VPValue *getVectorTripCount() const { return getOperand(1); }
+
+  VP_CLASSOF_IMPL(VPDef::VPEVLSC)
+
+  /// Generate the instructions to compute EVL.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for generating the active lane mask for the vector loop that is
 /// used to predicate the vector operations.
 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -314,13 +314,22 @@
   case VPInstruction::CanonicalIVIncrement:
   case VPInstruction::CanonicalIVIncrementNUW: {
     Value *Next = nullptr;
-    if (Part == 0) {
+    bool HasEVL = getNumOperands() == 2;
+    Value *EVL = HasEVL ? State.get(getOperand(1), Part) : nullptr;
+    if (Part == 0 || EVL) {
       bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
       auto *Phi = State.get(getOperand(0), 0);
       // The loop step is equal to the vectorization factor (num of SIMD
       // elements) times the unroll factor (num of SIMD instructions).
+      if (HasEVL) {
+        EVL = Builder.CreateIntCast(EVL, Phi->getType(), /*isSigned=*/false);
+        if (State.UF > 1)
+          EVL = Builder.CreateMul(EVL,
+                                  ConstantInt::get(Phi->getType(), State.UF));
+      }
       Value *Step =
-          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
+          HasEVL ? EVL
+                 : createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
       Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
     } else {
       Next = State.get(this, 0);
@@ -1137,6 +1146,45 @@
 }
 #endif
 
+void VPEVLRecipe::execute(VPTransformState &State) {
+  // Set EVL.
+  auto GetSetVL = [=](VPTransformState &State, Value *EVL) {
+    assert(EVL->getType()->isIntegerTy() &&
+           "Requested vector length should be an integer.");
+
+    // TODO: Add support for MaxSafeDist for correct loop emission.
+    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+
+    Value *GVL = State.Builder.CreateIntrinsic(
+        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
+        {EVL, VFArg, State.Builder.getInt1(State.VF.isScalable())});
+    return GVL;
+  };
+  // TODO: Restructure this code with an explicit remainder loop so that
+  // vsetvli can be hoisted out of the main loop, which would allow the main
+  // loop to be interleaved or unrolled.
+  assert(State.UF < 2 && "Neither unrolling nor interleaving is supported by "
+                         "predicated vectorization.");
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    // Compute TC - IV as the EVL (requested vector length).
+    Value *IV = State.get(getIV(), Part);
+    Value *TripCount = State.get(getVectorTripCount(), Part);
+    Value *EVL = State.Builder.CreateSub(TripCount, IV);
+    Value *SetVL = GetSetVL(State, EVL);
+    State.set(this, SetVL, Part);
+  }
+  State.EVL = this;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                        VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = EXPLICIT-VECTOR-LENGTH";
+}
+#endif
+
 bool VPCanonicalIVPHIRecipe::isCanonical(
     InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
     Type *Ty) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -350,6 +350,7 @@
     VPWidenMemoryInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
+    VPEVLSC,
     // START: Phi-like recipes. Need to be kept together.
VPBlendSC, VPPredInstPHISC, diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll @@ -0,0 +1,475 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL-LABEL: @foo( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL: for.body.preheader: +; IF-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP1]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP7:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP7]], ptr [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64 +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label 
[[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[WIDE_TRIP_COUNT]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.cond.cleanup.loopexit: +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; IF-EVL: for.body: +; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; FORCE-EVL-LABEL: @foo( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-EVL: for.body.preheader: +; FORCE-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; FORCE-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP1]], i32 4, i1 true) +; FORCE-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; FORCE-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; FORCE-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; FORCE-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; FORCE-EVL-NEXT: [[TMP7:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] +; FORCE-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP7]], ptr [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; FORCE-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; FORCE-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; 
FORCE-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[WIDE_TRIP_COUNT]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.cond.cleanup.loopexit: +; FORCE-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP10:%.*]] = add nsw [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; 
NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP10]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; NO-VP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define void @inteleave(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL-LABEL: @inteleave( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL: for.body.preheader: +; IF-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 
8 +; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[WIDE_TRIP_COUNT]]) +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP15]] +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-EVL-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP27]], i32 4, [[ACTIVE_LANE_MASK]]) +; IF-EVL-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP24]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK1]]) +; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]] +; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br 
i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.cond.cleanup.loopexit: +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; IF-EVL: for.body: +; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; +; FORCE-EVL-LABEL: @inteleave( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-EVL: for.body.preheader: +; FORCE-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; FORCE-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; FORCE-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; FORCE-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 +; FORCE-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; FORCE-EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[WIDE_TRIP_COUNT]]) +; FORCE-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; FORCE-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP10]] +; FORCE-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
@llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; FORCE-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; FORCE-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP15]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; FORCE-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; FORCE-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP10]] +; FORCE-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; FORCE-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; FORCE-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; FORCE-EVL-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; FORCE-EVL-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; FORCE-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; FORCE-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; FORCE-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP27]], i32 4, [[ACTIVE_LANE_MASK]]) +; FORCE-EVL-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; FORCE-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP24]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK1]]) +; FORCE-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]] +; FORCE-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.cond.cleanup.loopexit: +; FORCE-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: store i32 
[[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; +; NO-VP-LABEL: @inteleave( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 +; NO-VP-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; NO-VP-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[WIDE_TRIP_COUNT]]) +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP10]] +; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP15]] +; NO-VP-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP10]] +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; NO-VP-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; NO-VP-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; NO-VP-NEXT: [[TMP25:%.*]] = 
getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP27]], i32 4, [[ACTIVE_LANE_MASK]]) +; NO-VP-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP24]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK1]]) +; NO-VP-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]] +; NO-VP-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0 +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.interleave.count", i32 2} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -0,0 +1,117 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> vp<[[EVL]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +; FORCE-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; FORCE-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; FORCE-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; FORCE-EVL-EMPTY: +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: Successor(s): vector loop +; FORCE-EVL-EMPTY: +; FORCE-EVL-NEXT: vector loop: { +; FORCE-EVL-NEXT: vector.body: +; FORCE-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; FORCE-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; FORCE-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH +; FORCE-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; FORCE-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; 
FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> vp<[[EVL]]> +; FORCE-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; FORCE-EVL-NEXT: No successors +; FORCE-EVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-NEXT: EMIT vp<[[MASK:%.+]]> = active lane mask vp<[[ST]]> vp<[[TC]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias 
%b, ptr noalias %c, i32 %N) { +; IF-EVL-LABEL: @foo( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL: for.body.preheader: +; IF-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], +; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.cond.cleanup.loopexit: +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; IF-EVL: for.body: +; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; 
IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; FORCE-EVL-LABEL: @foo( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-EVL: for.body.preheader: +; FORCE-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15 +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; FORCE-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], +; FORCE-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; FORCE-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; FORCE-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; FORCE-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; FORCE-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; FORCE-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP0:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.cond.cleanup.loopexit: +; FORCE-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; NO-VP-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], +; NO-VP-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> 
@llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll @@ -0,0 +1,123 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 
-mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; IF-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]> vp<[[BETC]]> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +; FORCE-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; FORCE-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; FORCE-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; FORCE-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; FORCE-EVL-EMPTY: +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: Successor(s): vector loop +; FORCE-EVL-EMPTY: +; FORCE-EVL-NEXT: vector loop: { +; FORCE-EVL-NEXT: vector.body: +; FORCE-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; FORCE-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; FORCE-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; FORCE-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]> vp<[[BETC]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = 
VF * UF + vp<[[IV]]> +; FORCE-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; FORCE-EVL-NEXT: No successors +; FORCE-EVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; NO-VP-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; NO-VP-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]> vp<[[BETC]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}
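; A rough, illustrative sketch (not emitted by this patch): assuming the
; EXPLICIT-VECTOR-LENGTH plans checked above are eventually lowered to VP
; intrinsics, a single vector iteration of @foo on RISC-V could look roughly
; like the body below. The umin-based %evl computation, the fixed
; <vscale x 4 x i32> shape, and all value names here are assumptions for
; illustration only; llvm.vp.load/add/store and llvm.umin are the intrinsics
; documented in the LangRef ("Vector Predication Intrinsics").
define void @evl_body_sketch(ptr %a.gep, ptr %b.gep, ptr %c.gep,
                             i64 %remaining, i64 %vf) {
  ; Clamp the number of active lanes to what is left of the trip count
  ; (one possible way to materialize the EVL; hypothetical, not the patch's).
  %evl64 = call i64 @llvm.umin.i64(i64 %remaining, i64 %vf)
  %evl = trunc i64 %evl64 to i32
  ; All-true mask; predication is carried entirely by %evl.
  %m.ins = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
  %mask = shufflevector <vscale x 4 x i1> %m.ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ; Widened loads, add, and store of the a[i] = b[i] + c[i] body, with the
  ; active vector length passed as the trailing %evl operand.
  %vb = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %b.gep, <vscale x 4 x i1> %mask, i32 %evl)
  %vc = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %c.gep, <vscale x 4 x i1> %mask, i32 %evl)
  %add = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %vc, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %mask, i32 %evl)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %add, ptr %a.gep, <vscale x 4 x i1> %mask, i32 %evl)
  ret void
}

declare i64 @llvm.umin.i64(i64, i64)
declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)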