diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3335,8 +3335,8 @@
   SCEVExpander Exp(*SE, DL, "induction");
   auto Step = ID.getStep();
   auto StartValue = ID.getStartValue();
-  assert(Index->getType() == Step->getType() &&
-         "Index type does not match StepValue type");
+  assert(Index->getType()->getScalarType() == Step->getType() &&
+         "Index scalar type does not match StepValue type");
 
   // Note: the IR at this point is broken. We cannot use SE to create any new
   // SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -3355,14 +3355,20 @@
     return B.CreateAdd(X, Y);
   };
 
+  // We allow X to be a vector type, in which case Y will potentially be
+  // splatted into a vector with the same element count.
   auto CreateMul = [&B](Value *X, Value *Y) {
-    assert(X->getType() == Y->getType() && "Types don't match!");
+    assert(X->getType()->getScalarType() == Y->getType() &&
+           "Types don't match!");
     if (auto *CX = dyn_cast<ConstantInt>(X))
       if (CX->isOne())
        return Y;
     if (auto *CY = dyn_cast<ConstantInt>(Y))
       if (CY->isOne())
        return X;
+    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
+    if (XVTy && !isa<VectorType>(Y->getType()))
+      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
     return B.CreateMul(X, Y);
   };
 
@@ -3381,6 +3387,8 @@
 
   switch (ID.getKind()) {
   case InductionDescriptor::IK_IntInduction: {
+    assert(!isa<VectorType>(Index->getType()) &&
+           "Vector indices not supported for integer inductions yet");
     assert(Index->getType() == StartValue->getType() &&
            "Index type does not match StartValue type");
     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
@@ -3395,9 +3403,12 @@
     return B.CreateGEP(
         StartValue->getType()->getPointerElementType(), StartValue,
         CreateMul(Index,
-                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
+                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
+                                    GetInsertPoint())));
   }
   case InductionDescriptor::IK_FpInduction: {
+    assert(!isa<VectorType>(Index->getType()) &&
+           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
@@ -4816,13 +4827,33 @@
     // Determine the number of scalars we need to generate for each unroll
     // iteration. If the instruction is uniform, we only need to generate the
     // first lane. Otherwise, we generate all VF values.
     bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
-    assert((IsUniform || !VF.isScalable()) &&
-           "Currently unsupported for scalable vectors");
-    unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
+    unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+
+    bool NeedsVectorIndex = !IsUniform && VF.isScalable();
+    Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
+    if (NeedsVectorIndex) {
+      Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
+      UnitStepVec = Builder.CreateStepVector(VecIVTy);
+      PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
+    }
     for (unsigned Part = 0; Part < UF; ++Part) {
       Value *PartStart = createStepForVF(
           Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
+
+      if (NeedsVectorIndex) {
+        Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
+        Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
+        Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
+        Value *SclrGep =
+            emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
+        SclrGep->setName("next.gep");
+        State.set(PhiR, SclrGep, Part);
+        // We've cached the whole vector, which means we can support the
+        // extraction of any lane.
+        continue;
+      }
+
       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
         Value *Idx = Builder.CreateAdd(
             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -114,9 +114,68 @@
   ret void
 }
 
+
+;
+; Check multiple pointer induction variables where only one is recognized as
+; uniform and remains uniform after vectorization. The other pointer induction
+; variable is not recognized as uniform and is not uniform after vectorization
+; because it is stored to memory.
+;
+
+define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) {
+; CHECK-LABEL: @pointer_iv_mixed(
+; CHECK: vector.body
+; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[IDX]], i32 0
+; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
+; CHECK-NEXT: %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
+; CHECK-NEXT: %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
+; CHECK-NEXT: %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
+; CHECK-NEXT: %[[TMP4:.*]] = shufflevector <vscale x 2 x i64> %[[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: %[[TMP5:.*]] = add <vscale x 2 x i64> %[[TMP4]], %[[STEPVEC]]
+; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
+; CHECK-NEXT: %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
+; CHECK-NEXT: %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
+; CHECK: %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
+; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
+; CHECK: %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE32X2:.*]] = shl i32 %[[VSCALE32]], 1
+; CHECK-NEXT: %[[TMP6:.*]] = sext i32 %[[VSCALE32X2]] to i64
+; CHECK-NEXT: %[[GEPB2:.*]] = getelementptr i32*, i32** %[[GEPB1]], i64 %[[TMP6]]
+; CHECK-NEXT: %[[BPTR2:.*]] = bitcast i32** %[[GEPB2]] to <vscale x 2 x i32*>*
+; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS2]], <vscale x 2 x i32*>* %[[BPTR2]], align 8
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+  %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+  %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp1 = load i32, i32* %p, align 8
+  %tmp2 = add i32 %tmp1, %tmp0
+  store i32* %p, i32** %q, align 8
+  %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end, !llvm.loop !6
+
+for.end:
+  %tmp5 = phi i32 [ %tmp2, %for.body ]
+  ret i32 %tmp5
+}
+
+
 !0 = distinct !{!0, !1, !2, !3, !4, !5}
 !1 = !{!"llvm.loop.mustprogress"}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
 !5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = distinct !{!6, !1, !7, !3, !4, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
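
Note on the index computation introduced in widenPHIInstruction above: for
each unrolled part, the NeedsVectorIndex path materialises

  GlobalIndices = splat(PtrInd) + splat(PartStart) + stepvector

and hands the whole vector to emitTransformedIndex, so any lane can later be
extracted, instead of emitting Lanes scalar GEPs. The standalone sketch below
(not part of the patch; the function name global_indices and the module name
are illustrative) reproduces just this computation with the same IRBuilder
calls, CreateStepVector and CreateVectorSplat, and prints the resulting IR:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("stepvec-example", Ctx);

  // Index vectors are built in terms of the scalar phi type (i64 here).
  Type *I64 = Type::getInt64Ty(Ctx);
  ElementCount VF = ElementCount::getScalable(2); // VF = vscale x 2
  auto *VecIVTy = VectorType::get(I64, VF);

  // A function taking the scalar pointer-phi value and the part offset.
  auto *FnTy = FunctionType::get(VecIVTy, {I64, I64}, false);
  Function *Fn =
      Function::Create(FnTy, Function::ExternalLinkage, "global_indices", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", Fn));

  Value *PtrInd = Fn->getArg(0);    // scalar induction phi
  Value *PartStart = Fn->getArg(1); // Part * (vscale * 2)

  // <0, 1, ..., VF-1>, lowered to llvm.experimental.stepvector.
  Value *UnitStepVec = B.CreateStepVector(VecIVTy);
  // Broadcast both scalars across all lanes.
  Value *PartStartSplat = B.CreateVectorSplat(VF, PartStart);
  Value *PtrIndSplat = B.CreateVectorSplat(VF, PtrInd);
  // Lane L of part P indexes element PtrInd + PartStart + L.
  Value *Indices = B.CreateAdd(PartStartSplat, UnitStepVec);
  Value *GlobalIndices = B.CreateAdd(PtrIndSplat, Indices);

  B.CreateRet(GlobalIndices);
  M.print(outs(), nullptr);
  return 0;
}

Built against an LLVM that already provides CreateStepVector (LLVM 13 or
later), this emits the same stepvector/splat/add sequence that the CHECK
lines in sve-widen-phi.ll verify.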