Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -861,6 +861,10 @@
                          Type *ResultType, const Twine &Name = "");
 
+  /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
+  /// will be the same type as that of \p Scaling.
+  Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -80,6 +80,17 @@
   return CI;
 }
 
+Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
+  Module *M = GetInsertBlock()->getParent()->getParent();
+  assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
+  Function *TheFn =
+      Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
+  CallInst *CI = createCallHelper(TheFn, {}, this, Name);
+  return cast<ConstantInt>(Scaling)->getSExtValue() == 1
+             ? CI
+             : CreateMul(CI, Scaling);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -962,6 +962,15 @@
   return R;
 }
 
+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+  assert(isa<ConstantInt>(Step) && "Expected an integer step");
+  Constant *StepVal = ConstantInt::get(
+      Step->getType(),
+      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2101,8 +2110,6 @@
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
-  assert(!VF.isScalable() &&
-         "the code below assumes a fixed number of elements at compile time");
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
   assert(ScalarIVTy == Step->getType() &&
@@ -2127,11 +2134,24 @@
       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
           ? 1
           : VF.getKnownMinValue();
+  assert((!VF.isScalable() || Lanes == 1) &&
+         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   for (unsigned Part = 0; Part < UF; ++Part) {
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = getSignedIntOrFpConstant(
-          ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
+      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                         ScalarIVTy->getScalarSizeInBits());
+      Value *StartIdx =
+          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+      if (ScalarIVTy->isFloatingPointTy())
+        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
+      StartIdx = addFastMathFlag(Builder.CreateBinOp(
+          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+      // The step returned by `createStepForVF` is a runtime-evaluated value
+      // when VF is scalable. Otherwise, it should be folded into a Constant.
+      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+             "Expected StartIdx to be folded to a constant when VF is not "
+             "scalable");
       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
@@ -2174,10 +2194,11 @@
   // is known to be uniform after vectorization, this corresponds to lane zero
   // of the Part unroll iteration. Otherwise, the last instruction is the one
   // we created for the last vector lane of the Part unroll iteration.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+  assert((!VF.isScalable() || LastLane == 0) &&
+         "Scalable vectorization can't lead to any scalarized values.");
   auto *LastInst = cast<Instruction>(
       VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
 
@@ -2522,7 +2543,6 @@
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
 
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *DataTy = VectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
 
@@ -2555,6 +2575,9 @@
         InBounds = gep->isInBounds();
 
       if (Reverse) {
+        assert(!VF.isScalable() &&
+               "Reversing vectors is not yet supported for scalable vectors.");
+
         // If the address is consecutive but reversed, then the
         // wide store needs to start at the last vector element.
         PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
@@ -2566,8 +2589,9 @@
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
     }
 
@@ -2764,8 +2788,7 @@
   Type *Ty = TC->getType();
 
   // This is where we can make the step a runtime constant.
-  assert(!VF.isScalable() && "scalable vectorization is not supported yet");
-  Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
+  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
 
   // If the tail is to be folded by masking, round the number of iterations N
   // up to a multiple of Step instead of rounding down. This is done by first
@@ -2776,6 +2799,8 @@
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
+    assert(!VF.isScalable() &&
+           "Tail folding not yet supported for scalable vectors");
     TC = Builder.CreateAdd(
         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
   }
@@ -2854,11 +2879,9 @@
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
   if (!Cost->foldTailByMasking()) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    CheckMinIters = Builder.CreateICmp(
-        P, Count,
-        ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
-        "min.iters.check");
+    Value *Step =
+        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   }
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
@@ -3314,8 +3337,8 @@
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Induction =
       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -4161,7 +4184,6 @@
 }
 
 void InnerLoopVectorizer::fixLCSSAPHIs() {
-  assert(!VF.isScalable() && "the code below assumes fixed width vectors");
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
@@ -4172,6 +4194,8 @@
                               cast<Instruction>(IncomingValue), VF)
                               ? 0
                               : VF.getKnownMinValue() - 1;
+      assert((!VF.isScalable() || LastLane == 0) &&
+             "scalable vectors dont support non-uniform scalars yet");
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -5395,6 +5419,9 @@
 
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
+  // FIXME: This can be fixed for scalable vectors later, because at this stage
+  // the LoopVectorizer will only consider vectorizing a loop with scalable
+  // vectors when the loop has a hint to enable vectorization for a given VF.
   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
 
   float Cost = expectedCost(ElementCount::getFixed(1)).first;
@@ -5586,7 +5613,6 @@
   }
 
   // Clamp the interleave ranges to reasonable counts.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned MaxInterleaveCount =
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
 
@@ -5601,6 +5627,13 @@
   // If trip count is known or estimated compile time constant, limit the
   // interleave count to be less than the trip count divided by VF.
+  //
+  // For scalable vectors we can't know if interleaving is beneficial. It may
+  // not be beneficial for small loops if none of the lanes in the second vector
+  // iteration is enabled. However, for larger loops, there is likely to be a
+  // similar benefit as for fixed-width vectors. For now, we choose to leave
+  // the InterleaveCount as if vscale is '1', although if some information about
+  // the vector is known (e.g. min vector size), we can make a better decision.
   if (BestKnownTC) {
     MaxInterleaveCount =
         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
@@ -5636,7 +5669,7 @@
   // potentially expose ILP opportunities.
   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                     << "LV: IC is " << IC << '\n'
-                    << "LV: VF is " << VF.getKnownMinValue() << '\n');
+                    << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
@@ -6307,8 +6340,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
-  assert(!VF.isScalable() &&
-         "the cost model is not yet implemented for scalable vectorization");
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
@@ -6372,7 +6403,6 @@
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (VF.isScalar())
     return;
   NumPredStores = 0;
@@ -6961,7 +6991,6 @@
 
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
@@ -6996,6 +7025,8 @@
 
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
+  assert(!MaxVF.isScalable() &&
+         "Scalable vectors not yet supported beyond this point");
 
   for (ElementCount VF = ElementCount::getFixed(1);
        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
@@ -8060,6 +8091,7 @@
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
+    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
                                     IsPredicated, State);
     // Insert scalar instance packing it into a vector.
@@ -8080,6 +8112,8 @@
   // instruction is uniform inwhich case generate only the first lane for each
   // of the UF parts.
   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+  assert((!State.VF.isScalable() || IsUniform) &&
+         "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
@@ -8223,12 +8257,6 @@
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
 
   // Plan how to best vectorize, return the best VF and its cost.
   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -8394,13 +8422,6 @@
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
-
   unsigned UserIC = Hints.getInterleave();
 
   // Plan how to best vectorize, return the best VF and its cost.
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -163,7 +163,6 @@
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
     assert(Instance.Lane < VF.getKnownMinValue() &&
            "Queried Scalar Lane is too large.");
-    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
 
     if (!hasAnyScalarValue(Key))
       return false;
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+
+; CHECKUF1: for.body.preheader:
+; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECKUF1: vector.ph:
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF1: vector.body:
+; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
+
+; CHECKUF2: for.body.preheader:
+; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
+
+; CHECKUF2: vector.ph:
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
+; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF2: vector.body:
+; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
+; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                 ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                   ; preds = %for.body, %entry
+  ret void
+
+for.body:                           ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.width", !3}
+!3 = !{i32 4, i1 true}
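
; A minimal illustrative sketch (not part of the patch itself): the hint
; metadata above, !{i32 4, i1 true}, encodes a requested width of 4 with the
; scalable flag set, i.e. a VF of <vscale x 4 x double> for this loop. Before
; instcombine folds the multiply into the `shl` form matched by the CHECK
; lines, the step produced by createStepForVF/CreateVScale for UF=1 is expected
; to look roughly like the following (register names here are hypothetical):
;
;   %vscale     = call i64 @llvm.vscale.i64()   ; runtime vscale factor
;   %step       = mul i64 %vscale, 4            ; known minimum VF (4) * UF (1)
;   %index.next = add i64 %index, %step         ; advance canonical induction variable
;
; With -force-vector-interleave=2 the scaling constant becomes 8 (VF * UF),
; which matches the `shl i64 %{{.*}}, 3` checked under CHECKUF2.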