Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3308,8 +3308,8 @@ SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); auto StartValue = ID.getStartValue(); - assert(Index->getType() == Step->getType() && - "Index type does not match StepValue type"); + assert(Index->getType()->getScalarType() == Step->getType() && + "Index scalar type does not match StepValue type"); // Note: the IR at this point is broken. We cannot use SE to create any new // SCEV and then expand it, hoping that SCEV's simplification will give us @@ -3328,14 +3328,20 @@ return B.CreateAdd(X, Y); }; + // We allow X to be a vector type, in which case Y will potentially be + // splatted into a vector with the same element count. auto CreateMul = [&B](Value *X, Value *Y) { - assert(X->getType() == Y->getType() && "Types don't match!"); + assert(X->getType()->getScalarType() == Y->getType() && + "Types don't match!"); if (auto *CX = dyn_cast<ConstantInt>(X)) if (CX->isOne()) return Y; if (auto *CY = dyn_cast<ConstantInt>(Y)) if (CY->isOne()) return X; + VectorType *XVTy = dyn_cast<VectorType>(X->getType()); + if (XVTy && !isa<VectorType>(Y->getType())) + Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); return B.CreateMul(X, Y); }; @@ -3354,6 +3360,7 @@ switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { + assert(!isa<VectorType>(Index->getType())); assert(Index->getType() == StartValue->getType() && "Index type does not match StartValue type"); if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) @@ -3368,9 +3375,10 @@ return B.CreateGEP( StartValue->getType()->getPointerElementType(), StartValue, CreateMul(Index, - Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); + Exp.expandCodeFor(Step, Index->getType()->getScalarType(), GetInsertPoint()))); } case InductionDescriptor::IK_FpInduction: { + 
assert(!isa<VectorType>(Index->getType())); assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); auto InductionBinOp = ID.getInductionBinOp(); assert(InductionBinOp && @@ -4769,13 +4777,34 @@ // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); - assert((IsUniform || !VF.isScalable()) && - "Currently unsupported for scalable vectors"); - unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); + + Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; + if (!IsUniform && VF.isScalable()) { + Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); + UnitStepVec = Builder.CreateStepVector(VecIVTy); + PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); + } for (unsigned Part = 0; Part < UF; ++Part) { Value *PartStart = createStepForVF( Builder, ConstantInt::get(PtrInd->getType(), Part), VF); + + if (!IsUniform && VF.isScalable()) { + Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); + Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); + Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); + Value *SclrGep = + emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); + SclrGep->setName("next.gep"); + State.set(PhiR, SclrGep, Part); + // We've cached the whole vector, which means we can support the + // extraction of any lane. In addition, we can also cache the first + // known minimum number of lanes as a further optimization, since + // we frequently want to extract the first lane anyway. This avoids + // an additional extractelement operation to get the first lane. 
+ } + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -114,9 +114,57 @@ ret void } + +; +; Check multiple pointer induction variables where only one is recognized as +; uniform and remains uniform after vectorization. The other pointer induction +; variable is not recognized as uniform and is not uniform after vectorization +; because it is stored to memory. +; + +define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) { +; CHECK-LABEL: @pointer_iv_mixed( +; CHECK: vector.body +; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[IDX]], i32 0 +; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: %[[VECIND:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]] +; CHECK-NEXT: %[[APTRS:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND]] +; CHECK-NEXT: %{{.*}} = getelementptr i32, i32* %a, i64 %[[IDX]] +; CHECK-NEXT: %[[GEPB:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]] +; CHECK: %[[BPTR:.*]] = bitcast i32** %[[GEPB]] to <vscale x 2 x i32*>* +; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS]], <vscale x 2 x i32*>* %[[BPTR]], align 8 + +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ] + %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ] + %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ] + %tmp1 = load i32, i32* %p, align 8 + %tmp2 = add i32 %tmp1, %tmp0 + store i32* %p, i32** %q, align 8 + %tmp3 = getelementptr inbounds i32, i32* %p, i32 1 + %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1 
+ %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end, !llvm.loop !6 + +for.end: + %tmp5 = phi i32 [ %tmp2, %for.body ] + ret i32 %tmp5 +} + + !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} !4 = !{!"llvm.loop.vectorize.enable", i1 true} !5 = !{!"llvm.loop.interleave.count", i32 2} +!6 = distinct !{!6, !1, !7, !3, !4, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 2} +!8 = !{!"llvm.loop.interleave.count", i32 1}