Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2245,16 +2245,20 @@
   // Multiply the vectorization factor by the step using integer or
   // floating-point arithmetic as appropriate.
-  Value *ConstVF =
-      getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
-  Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF);
+  Type *StepType = Step->getType();
+  if (Step->getType()->isFloatingPointTy())
+    StepType = IntegerType::get(StepType->getContext(),
+                                StepType->getScalarSizeInBits());
+  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
+  if (Step->getType()->isFloatingPointTy())
+    RuntimeVF = Builder.CreateUIToFP(RuntimeVF, Step->getType());
+  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
 
   // Create a vector splat to use in the induction update.
   //
   // FIXME: If the step is non-constant, we create the vector splat with
   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
   //        handle a constant vector splat.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   Value *SplatVF = isa<Constant>(Mul)
                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                        : Builder.CreateVectorSplat(VF, Mul);
 
@@ -2457,8 +2461,6 @@
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                           Instruction::BinaryOps BinOp) {
   // Create and check the types.
-  assert(isa<FixedVectorType>(Val->getType()) &&
-         "Creation of scalable step vector not yet supported");
   auto *ValVTy = cast<VectorType>(Val->getType());
   ElementCount VLen = ValVTy->getElementCount();
 
@@ -2533,19 +2535,43 @@
       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
           ? 1
           : VF.getKnownMinValue();
-  assert((!VF.isScalable() || Lanes == 1) &&
-         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in State.
+  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                     ScalarIVTy->getScalarSizeInBits());
+  Type *VecIVTy = nullptr;
+  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
+  if (Lanes > 1 && VF.isScalable()) {
+    VecIVTy = VectorType::get(ScalarIVTy, VF);
+    UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
+    SplatStep = Builder.CreateVectorSplat(VF, Step);
+    SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
+  }
+
   for (unsigned Part = 0; Part < UF; ++Part) {
-    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
-                                         ScalarIVTy->getScalarSizeInBits());
-      Value *StartIdx =
-          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+    Value *StartIdx0 =
+        createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+
+    if (Lanes > 1 && VF.isScalable()) {
+      auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
+      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
       if (ScalarIVTy->isFloatingPointTy())
-        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
-      StartIdx = Builder.CreateBinOp(
-          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane));
+        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+      State.set(Def, Add, Part);
+      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
+                                            Part);
+      // It's useful to record the lane values too for the known minimum number
+      // of elements so we do those below. This improves the code quality when
+      // trying to extract the first element, for example.
+    }
+
+    if (ScalarIVTy->isFloatingPointTy())
+      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
+
+    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+      Value *StartIdx = Builder.CreateBinOp(
+          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
       // The step returned by `createStepForVF` is a runtime-evaluated value
       // when VF is scalable. Otherwise, it should be folded into a Constant.
       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
@@ -4721,6 +4747,7 @@
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
+    assert(!VF.isScalable() && "Currently unsupported for scalable vectors");
     if (Cost->isScalarAfterVectorization(P, State.VF)) {
       // This is the normalized GEP that starts counting at zero.
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -0,0 +1,189 @@
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine < %s -S 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Test that we can add on the induction variable
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = b[i] + i;
+;   }
+; with an unroll factor (interleave count) of 2.
+
+define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled(
+; CHECK-NEXT:  entry:
+; CHECK:       vector.body:
+; CHECK-NEXT:    %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT:    %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:    %[[IDXSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %[[EC:.*]] = shl i64 %[[VSCALE]], 1
+; CHECK-NEXT:    %[[TMP2:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT:    %[[ECSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    %[[TMP3:.*]] = add <vscale x 2 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK:         %[[LOAD1:.*]] = load <vscale x 2 x i64>
+; CHECK:         %[[LOAD2:.*]] = load <vscale x 2 x i64>
+; CHECK:         %[[STOREVAL1:.*]] = add nsw <vscale x 2 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK:         %[[STOREVAL2:.*]] = add nsw <vscale x 2 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK:         store <vscale x 2 x i64> %[[STOREVAL1]]
+; CHECK:         store <vscale x 2 x i64> %[[STOREVAL2]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.08
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+
+; Test that we can vectorize a separate induction variable (not used for the branch)
+;   int r = 0;
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = r;
+;     r += 2;
+;   }
+; with an unroll factor (interleave count) of 1.
+
+
+define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_ind32(
+; CHECK:       vector.ph:
+; CHECK:         %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    %[[INDINIT:.*]] = shl <vscale x 4 x i32> %[[STEPVEC]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    %[[INC:.*]] = shl i32 %[[VSCALE]], 3
+; CHECK-NEXT:    %[[TMP:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INC]], i32 0
+; CHECK-NEXT:    %[[VECINC:.*]] = shufflevector <vscale x 4 x i32> %[[TMP]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK:         %[[VECIND:.*]] = phi <vscale x 4 x i32> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK:         store <vscale x 4 x i32> %[[VECIND]]
+; CHECK:         %[[VECINDNXT]] = add <vscale x 4 x i32> %[[VECIND]], %[[VECINC]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.08
+  store i32 %r.07, i32* %arrayidx, align 4
+  %add = add nuw nsw i32 %r.07, 2
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+
+; Test that we can vectorize a separate FP induction variable (not used for the branch)
+;   float r = 0;
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = r;
+;     r += 2;
+;   }
+; with an unroll factor (interleave count) of 1.
+
+define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_indf32(
+; CHECK:       vector.ph:
+; CHECK:         %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    %[[TMP1:.*]] = uitofp <vscale x 4 x i32> %[[STEPVEC]] to <vscale x 4 x float>
+; CHECK-NEXT:    %[[TMP2:.*]] = fmul <vscale x 4 x float> %[[TMP1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    %[[INDINIT:.*]] = fadd <vscale x 4 x float> %[[TMP2]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    %[[TMP3:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT:    %[[TMP4:.*]] = uitofp i32 %[[TMP3]] to float
+; CHECK-NEXT:    %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00
+; CHECK-NEXT:    %[[TMP5:.*]] = insertelement <vscale x 4 x float> poison, float %[[INC]], i32 0
+; CHECK-NEXT:    %[[VECINC:.*]] = shufflevector <vscale x 4 x float> %[[TMP5]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK:         %[[VECIND:.*]] = phi <vscale x 4 x float> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK:         store <vscale x 4 x float> %[[VECIND]]
+; CHECK:         %[[VECINDNXT]] = fadd <vscale x 4 x float> %[[VECIND]], %[[VECINC]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %i.08
+  store float %r.07, float* %arrayidx, align 4
+  %add = fadd float %r.07, 2.000000e+00
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+; Test a case where the vectorised induction variable is used to
+; generate a mask:
+;   for (long long i = 0; i < n; i++) {
+;     if (i & 0x1)
+;       a[i] = b[i];
+;   }
+
+define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @cond_ind64(
+; CHECK:       vector.body:
+; CHECK-NEXT:    %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK:         %[[STEPVEC:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    %[[TMP1:.*]] = insertelement <vscale x 4 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:    %[[IDXSPLT:.*]] = shufflevector <vscale x 4 x i64> %[[TMP1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    %[[VECIND:.*]] = add <vscale x 4 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[MASK:.*]] = trunc <vscale x 4 x i64> %[[VECIND]] to <vscale x 4 x i1>
+; CHECK:         %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x i32> poison)
+; CHECK:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[LOAD]], <vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.08 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %and = and i64 %i.08, 1
+  %tobool.not = icmp eq i64 %and, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.08
+  store i32 %0, i32* %arrayidx1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.inc
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 2}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !7, !3, !8, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.interleave.count", i32 1}