Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -861,6 +861,10 @@
                                  Type *ResultType,
                                  const Twine &Name = "");

+  /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
+  /// will be the same type as that of \p Scaling.
+  Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -80,6 +80,17 @@
   return CI;
 }

+Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
+  Module *M = GetInsertBlock()->getParent()->getParent();
+  assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
+  Function *TheFn =
+      Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
+  CallInst *CI = createCallHelper(TheFn, {}, this, Name);
+  return cast<ConstantInt>(Scaling)->getSExtValue() == 1
+             ? CI
+             : CreateMul(CI, Scaling);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -348,7 +348,6 @@
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type at the given vectorization factor.
 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   // Determine if an array of VF elements of type Ty is "bitcast compatible"
   // with a vector.
   if (VF.isVector()) {
@@ -963,6 +962,15 @@
   return R;
 }

+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+  assert(isa<ConstantInt>(Step) && "Expected an integer step");
+  Constant *StepVal = ConstantInt::get(
+      Step->getType(),
+      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
 namespace llvm {

 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -1225,9 +1233,7 @@
   /// width \p VF. Return CM_Unknown if this instruction did not pass
   /// through the cost modeling.
   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    assert(VF.isVector() && "Expected VF >=2");
-
+    assert(VF.isVector() && "Expected VF to be a vector VF");
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
     if (EnableVPlanNativePath)
@@ -2104,8 +2110,6 @@
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
-  assert(!VF.isScalable() &&
-         "the code below assumes a fixed number of elements at compile time");
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
   assert(ScalarIVTy == Step->getType() &&
@@ -2130,6 +2134,8 @@
       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ?
           1 : VF.getKnownMinValue();
+  assert((!VF.isScalable() || Lanes == 1) &&
+         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   for (unsigned Part = 0; Part < UF; ++Part) {
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
@@ -2177,10 +2183,11 @@
   // is known to be uniform after vectorization, this corresponds to lane zero
   // of the Part unroll iteration. Otherwise, the last instruction is the one
   // we created for the last vector lane of the Part unroll iteration.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+  assert((!VF.isScalable() || LastLane == 0) &&
+         "Scalable vectorization can't lead to any scalarized values.");
   auto *LastInst = cast<Instruction>(
       VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
@@ -2525,7 +2532,6 @@
   Type *ScalarDataTy = getMemInstValueType(Instr);

-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *DataTy = VectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
@@ -2558,19 +2564,25 @@
       InBounds = gep->isInBounds();

     if (Reverse) {
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(-Part), VF);
+
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
+          ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
+      Value *Offset =
+          Builder.CreateSub(Builder.getInt32(1),
+                            createStepForVF(Builder, Builder.getInt32(1), VF));
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
+          ScalarDataTy, PartPtr, Offset));
       PartPtr->setIsInBounds(InBounds);
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
     }
@@ -2766,10 +2778,8 @@
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Type *Ty = TC->getType();
-  // This is where we can make the step a runtime constant.
-  assert(!VF.isScalable() && "scalable vectorization is not supported yet");
-  Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
+  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);

   // If the tail is to be folded by masking, round the number of iterations N
   // up to a multiple of Step instead of rounding down. This is done by first
@@ -2780,6 +2790,8 @@
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
+    assert(!VF.isScalable() &&
+           "Tail folding not yet supported for scalable vectors");
     TC = Builder.CreateAdd(
         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
   }
@@ -2858,11 +2870,9 @@
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
   if (!Cost->foldTailByMasking()) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    CheckMinIters = Builder.CreateICmp(
-        P, Count,
-        ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
-        "min.iters.check");
+    Value *Step =
+        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   }
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
@@ -3318,8 +3328,8 @@
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -3699,8 +3709,8 @@
   // profile is not inherently precise anyway. Note also possible bypass of
   // vector code caused by legality checks is ignored, assigning all the weight
   // to the vector loop, optimistically.
-  assert(!VF.isScalable() &&
-         "cannot use scalable ElementCount to determine unroll factor");
+
+  // TODO: Consider changing this calculation for scalable vectors.
   setProfileInfoAfterUnrolling(
       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
@@ -4163,7 +4173,6 @@
 void InnerLoopVectorizer::fixLCSSAPHIs() {
-  assert(!VF.isScalable() && "the code below assumes fixed width vectors");
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
@@ -4174,6 +4183,8 @@
                                cast<Instruction>(IncomingValue), VF)
                                ? 0
                                : VF.getKnownMinValue() - 1;
+      assert((!VF.isScalable() || LastLane == 0) &&
+             "scalable vectors dont support non-uniform scalars yet");
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -4505,7 +4516,6 @@
 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
                                            VPTransformState &State) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   switch (I.getOpcode()) {
   case Instruction::Call:
   case Instruction::Br:
@@ -4894,7 +4904,6 @@
 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                          ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (!blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
@@ -5592,7 +5601,6 @@
   }

   // Clamp the interleave ranges to reasonable counts.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned MaxInterleaveCount =
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
@@ -5794,7 +5802,9 @@
     if (Ty->isTokenTy())
       return 0U;
     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
+    // This assert can be removed, because the answer probably wouldn't be
+    // any different than for fixed-width vectors.
+    // assert(!VF.isScalable() && "scalable vectors not yet supported.");
     return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / WidestRegister);
   };
@@ -6072,7 +6082,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   VectorizationCostTy Cost;

   // For each block.
@@ -6319,13 +6328,22 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
-  assert(!VF.isScalable() &&
-         "the cost model is not yet implemented for scalable vectorization");
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
     VF = ElementCount::getFixed(1);

+  // FIXME: Implement a proper cost-model for scalable vectors.
+  // For now disable the LoopVectorizationCostModel for scalable vectors,
+  // because there are too many code-paths that have either:
+  // `assert(!VF.isScalable())` or `cast<FixedVectorType>(..)`.
+  // After we fix up those code-paths, we can remove this shortcut.
+  // Because the only way to vectorize a loop using scalable vectors is by
+  // forcing it through the LoopHints, the cost-model for scalable vectors
+  // is not yet relevant anyway.
+  if (VF.isScalable())
+    return {1, true};
+
   if (VF.isVector() && isProfitableToScalarize(I, VF))
     return VectorizationCostTy(InstsToScalarize[VF][I], false);
@@ -6384,7 +6402,6 @@
 }

 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (VF.isScalar())
     return;
   NumPredStores = 0;
@@ -6973,7 +6990,6 @@
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
@@ -8077,6 +8093,7 @@
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
+    assert(!State.VF.isScalable() && "Can't scalarise a scalable vector");
     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
                                     IsPredicated, State);
     // Insert scalar instance packing it into a vector.
@@ -8097,6 +8114,8 @@
   // instruction is uniform inwhich case generate only the first lane for each
   // of the UF parts.
   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+  assert((!State.VF.isScalable() || IsUniform) &&
+         "Can't scalarise a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
       State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,65 @@
+; For now this test requires aarch64-registered-target, until we can
+; also pass the loop hint as a 'force-vector-width' flag to opt.
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -S -loop-vectorize -instcombine < %s | FileCheck %s
+
+source_filename = "loop.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: for.body.preheader:
+; CHECK-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECK: vector.ph:
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECK: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECK: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECK: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECK: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECK: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECK: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECK: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                       ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                         ; preds = %for.body, %entry
+  ret void
+
+for.body:                                 ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", !4}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{i32 4, i1 1}
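Note (reviewer aid, not part of the patch): the snippet below is a minimal sketch of how the new IRBuilderBase::CreateVScale hook composes with the createStepForVF pattern introduced above, i.e. materialising a step of `vscale * (MinVF * UF)` for scalable VFs and a plain constant otherwise. The helper name emitRuntimeVF and the module/function boilerplate are illustrative assumptions only, not code from this patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper mirroring createStepForVF: a compile-time constant for
// fixed VFs, and `vscale * (MinVF * UF)` for scalable VFs.
static Value *emitRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                            unsigned UF) {
  Constant *MinStep = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
  // CreateVScale emits the llvm.vscale intrinsic and, per the patch, only
  // emits the extra multiply when the scaling factor is not 1.
  return VF.isScalable() ? B.CreateVScale(MinStep) : MinStep;
}

int main() {
  LLVMContext Ctx;
  Module M("vscale-demo", Ctx);
  IRBuilder<> B(Ctx);

  // An empty i64-returning function to hold the generated step computation.
  auto *FnTy = FunctionType::get(B.getInt64Ty(), /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "step", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  B.SetInsertPoint(BB);

  // <vscale x 4 x ...> elements with an unroll factor of 2 -> vscale * 8.
  ElementCount VF = ElementCount::getScalable(4);
  Value *Step = emitRuntimeVF(B, B.getInt64Ty(), VF, /*UF=*/2);
  B.CreateRet(Step);

  M.print(outs(), nullptr);
  return 0;
}

With a fixed VF (e.g. ElementCount::getFixed(4)) the same call simply returns an i64 constant, which is why the patch can route both the fixed-width and scalable cases through createStepForVF without special-casing the callers.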