diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -879,6 +879,10 @@
                              Type *ResultType,
                              const Twine &Name = "");
 
+  /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
+  /// will be the same type as that of \p Scaling.
+  Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -80,6 +80,17 @@
   return CI;
 }
 
+Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
+  Module *M = GetInsertBlock()->getParent()->getParent();
+  assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
+  Function *TheFn =
+      Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
+  CallInst *CI = createCallHelper(TheFn, {}, this, Name);
+  return cast<ConstantInt>(Scaling)->getSExtValue() == 1
+             ? CI
+             : CreateMul(CI, Scaling);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1121,6 +1121,15 @@
   return R;
 }
 
+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+  assert(isa<ConstantInt>(Step) && "Expected an integer step");
+  Constant *StepVal = ConstantInt::get(
+      Step->getType(),
+      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2277,8 +2286,6 @@
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
-  assert(!VF.isScalable() &&
-         "the code below assumes a fixed number of elements at compile time");
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
   assert(ScalarIVTy == Step->getType() &&
@@ -2303,11 +2310,24 @@
       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
           ? 1
           : VF.getKnownMinValue();
+  assert((!VF.isScalable() || Lanes == 1) &&
+         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   for (unsigned Part = 0; Part < UF; ++Part) {
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = getSignedIntOrFpConstant(
-          ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
+      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                         ScalarIVTy->getScalarSizeInBits());
+      Value *StartIdx =
+          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+      if (ScalarIVTy->isFloatingPointTy())
+        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
+      StartIdx = addFastMathFlag(Builder.CreateBinOp(
+          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+      // The step returned by `createStepForVF` is a runtime-evaluated value
+      // when VF is scalable. Otherwise, it should be folded into a Constant.
+      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+             "Expected StartIdx to be folded to a constant when VF is not "
+             "scalable");
       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
@@ -2350,10 +2370,11 @@
   // is known to be uniform after vectorization, this corresponds to lane zero
   // of the Part unroll iteration. Otherwise, the last instruction is the one
   // we created for the last vector lane of the Part unroll iteration.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+  assert((!VF.isScalable() || LastLane == 0) &&
+         "Scalable vectorization can't lead to any scalarized values.");
   auto *LastInst = cast<Instruction>(
       VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
@@ -2695,7 +2716,6 @@
   Type *ScalarDataTy = getMemInstValueType(Instr);
 
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *DataTy = VectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
@@ -2728,6 +2748,9 @@
         InBounds = gep->isInBounds();
 
       if (Reverse) {
+        assert(!VF.isScalable() &&
+               "Reversing vectors is not yet supported for scalable vectors.");
+
         // If the address is consecutive but reversed, then the
         // wide store needs to start at the last vector element.
         PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
@@ -2739,8 +2762,9 @@
         if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
           BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
       } else {
-        PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-            ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
+        Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+        PartPtr = cast<GetElementPtrInst>(
+            Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
         PartPtr->setIsInBounds(InBounds);
       }
 
@@ -2945,8 +2969,7 @@
   Type *Ty = TC->getType();
 
   // This is where we can make the step a runtime constant.
-  assert(!VF.isScalable() && "scalable vectorization is not supported yet");
-  Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
+  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
 
   // If the tail is to be folded by masking, round the number of iterations N
   // up to a multiple of Step instead of rounding down. This is done by first
@@ -2957,6 +2980,8 @@
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
+    assert(!VF.isScalable() &&
+           "Tail folding not yet supported for scalable vectors");
     TC = Builder.CreateAdd(
         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
   }
@@ -3035,11 +3060,9 @@
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
   if (!Cost->foldTailByMasking()) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    CheckMinIters = Builder.CreateICmp(
-        P, Count,
-        ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
-        "min.iters.check");
+    Value *Step =
+        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   }
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
@@ -3518,8 +3541,8 @@
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Induction =
       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -4365,7 +4388,6 @@
 }
 
 void InnerLoopVectorizer::fixLCSSAPHIs() {
-  assert(!VF.isScalable() && "the code below assumes fixed width vectors");
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
@@ -4376,6 +4398,8 @@
                                  cast<Instruction>(IncomingValue), VF)
               ? 0
               : VF.getKnownMinValue() - 1;
+      assert((!VF.isScalable() || LastLane == 0) &&
+             "scalable vectors dont support non-uniform scalars yet");
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -5528,7 +5552,6 @@
 ElementCount
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                  ElementCount UserVF) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5541,6 +5564,11 @@
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
   if (UserVF.isNonZero()) {
+    // For now, don't verify legality of scalable vectors.
+    // This will be addressed properly in https://reviews.llvm.org/D91718.
+    if (UserVF.isScalable())
+      return UserVF;
+
     // If legally unsafe, clamp the user vectorization factor to a safe value.
     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
     if (UserVF.getFixedValue() <= MaxSafeVF)
@@ -5629,6 +5657,9 @@
 
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
+  // FIXME: This can be fixed for scalable vectors later, because at this stage
+  // the LoopVectorizer will only consider vectorizing a loop with scalable
+  // vectors when the loop has a hint to enable vectorization for a given VF.
   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
 
   float Cost = expectedCost(ElementCount::getFixed(1)).first;
@@ -5938,7 +5969,6 @@
   }
 
   // Clamp the interleave ranges to reasonable counts.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned MaxInterleaveCount =
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
 
@@ -5954,6 +5984,13 @@
   // If trip count is known or estimated compile time constant, limit the
   // interleave count to be less than the trip count divided by VF, provided it
   // is at least 1.
+  //
+  // For scalable vectors we can't know if interleaving is beneficial. It may
+  // not be beneficial for small loops if none of the lanes in the second vector
+  // iteration is enabled. However, for larger loops, there is likely to be a
+  // similar benefit as for fixed-width vectors. For now, we choose to leave
+  // the InterleaveCount as if vscale is '1', although if some information about
+  // the vector is known (e.g. min vector size), we can make a better decision.
   if (BestKnownTC) {
     MaxInterleaveCount =
         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
@@ -5997,7 +6034,7 @@
   // potentially expose ILP opportunities.
   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                     << "LV: IC is " << IC << '\n'
-                    << "LV: VF is " << VF.getKnownMinValue() << '\n');
+                    << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
@@ -6664,8 +6701,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
-  assert(!VF.isScalable() &&
-         "the cost model is not yet implemented for scalable vectorization");
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
@@ -6729,7 +6764,6 @@
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (VF.isScalar())
     return;
   NumPredStores = 0;
@@ -7316,7 +7350,6 @@
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
@@ -7339,9 +7372,9 @@
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
-  if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
+  if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getFixedValue()) &&
+    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
@@ -7352,6 +7385,9 @@
     return {{UserVF, 0}};
   }
 
+  assert(!MaxVF.isScalable() &&
+         "Scalable vectors not yet supported beyond this point");
+
   for (ElementCount VF = ElementCount::getFixed(1);
        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
@@ -8695,6 +8731,7 @@
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
+    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
                                     *State.Instance, IsPredicated, State);
     // Insert scalar instance packing it into a vector.
@@ -8717,6 +8754,8 @@
   // instruction is uniform inwhich case generate only the first lane for each
   // of the UF parts.
   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+  assert((!State.VF.isScalable() || IsUniform) &&
+         "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
@@ -8870,12 +8909,6 @@
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
 
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
 
   // Plan how to best vectorize, return the best VF and its cost.
   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -9041,13 +9074,6 @@
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
 
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
-
   unsigned UserIC = Hints.getInterleave();
 
   // Plan how to best vectorize, return the best VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -163,7 +163,6 @@
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
     assert(Instance.Lane < VF.getKnownMinValue() &&
            "Queried Scalar Lane is too large.");
-    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
 
     if (!hasAnyScalarValue(Key))
       return false;
diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
--- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -13,8 +13,7 @@
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = trunc i64 %indvars.iv to i32
-  store i32 %0, i32* %arrayidx, align 4
+  store i32 42, i32* %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -25,7 +24,7 @@
 }
 
 ; CHECK-LABEL: @test2(
-; CHECK: store <8 x i32>
+; CHECK: store <vscale x 8 x i32>
 ; CHECK: ret void
 define void @test2(i32* nocapture %a, i32 %n) #0 {
 entry:
@@ -35,8 +34,7 @@
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = trunc i64 %indvars.iv to i32
-  store i32 %0, i32* %arrayidx, align 4
+  store i32 42, i32* %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -57,8 +55,7 @@
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %0 = trunc i64 %indvars.iv to i32
-  store i32 %0, i32* %arrayidx, align 4
+  store i32 42, i32* %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+
+; CHECKUF1: for.body.preheader:
+; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECKUF1: vector.ph:
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF1: vector.body:
+; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
+
+; CHECKUF2: for.body.preheader:
+; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
+
+; CHECKUF2: vector.ph:
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
+; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF2: vector.body:
+; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
+; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
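
For reference, the following is a minimal, hypothetical sketch (not part of the patch itself) of how the new IRBuilderBase::CreateVScale helper is exercised; the driver function, module name and constants are invented for illustration. It mirrors what createStepForVF emits for a scalable VF with a known minimum of 4 and an unroll factor of 2: a call to @llvm.vscale.i64 multiplied by 8, which instcombine later folds into the shift-by-3 that the CHECKUF2 lines above expect (for a scaling of 1 the multiply is skipped entirely).

// Illustrative only: everything except CreateVScale itself is an assumption
// made for this example.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("vscale-demo", Ctx);
  IRBuilder<> B(Ctx);

  // i64 @step(): returns vscale * 8, i.e. the loop increment that
  // createStepForVF would build for VF = <vscale x 4 x ...> and UF = 2.
  auto *FnTy = FunctionType::get(B.getInt64Ty(), /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "step", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  Value *Step = B.CreateVScale(B.getInt64(4 * 2), "vs");
  B.CreateRet(Step);

  // Roughly the emitted body (before instcombine turns the mul into a shl):
  //   %vs = call i64 @llvm.vscale.i64()
  //   %0 = mul i64 %vs, 8
  //   ret i64 %0
  M.print(outs(), nullptr);
  return 0;
}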