Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -518,6 +518,14 @@
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);
 
+  /// A helper function to scalarize a single Instruction in the innermost
+  /// loop, which is currently only used for scalable vectors. Generates a
+  /// whole-vector equivalent for a given \p Part, which may involve a simple
+  /// broadcast in the case of uniform instructions. Uses the VPValue operands
+  /// from \p Operands instead of \p Instr's operands.
+  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+                            unsigned Part, VPTransformState &State);
+
   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
   /// is provided, the integer induction variable will first be truncated to
   /// the corresponding type.
@@ -3063,6 +3071,87 @@
   PredicatedInstructions.push_back(Cloned);
 }
 
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
+                                               VPUser &User, unsigned Part,
+                                               VPTransformState &State) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+
+  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated
+  // for the first part.
+  if (isa<NoAliasScopeDeclInst>(Instr) && Part != 0)
+    return;
+
+  setDebugLocFromInst(Instr);
+
+  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
+                               Builder.GetInsertPoint());
+
+  // Does this instruction return a value?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Value *NewPart = nullptr;
+
+  if (OrigLoop->hasLoopInvariantOperands(Instr)) {
+    // This instruction does not have any operands that vary in the loop.
+
+    // First we clone the scalar instruction for the vector.body and copy
+    // the metadata across.
+    Instruction *Cloned = Instr->clone();
+    addNewMetadata(Cloned, Instr);
+
+    // Place the cloned scalar in the new loop.
+    Builder.Insert(Cloned);
+
+    if (!IsVoidRetTy) {
+      // Since the instruction returns a scalar type we should broadcast that
+      // value across all lanes of the vector.
+      Cloned->setName(Instr->getName() + ".cloned");
+      NewPart = State.Builder.CreateVectorSplat(State.VF, (Value *)Cloned);
+      addMetadata(NewPart, Instr);
+    } else
+      NewPart = Cloned;
+  } else {
+    SmallVector<Value *, 4> Ops;
+
+    // Create a new set of operands for the vector instruction. If the operand
+    // is invariant or uniform in the loop we leave it as a scalar, otherwise
+    // we use the full vector equivalent.
+    for (unsigned OpI = 0, E = User.getNumOperands(); OpI != E; ++OpI) {
+      auto *Operand = dyn_cast<Instruction>(Instr->getOperand(OpI));
+      if (!Operand || !OrigLoop->contains(Operand) ||
+          Cost->isUniformAfterVectorization(Operand, State.VF)) {
+        VPIteration InputInstance(Part, 0);
+        InputInstance.Lane = VPLane::getFirstLane();
+        Ops.push_back(State.get(User.getOperand(OpI), InputInstance));
+      } else
+        Ops.push_back(State.get(User.getOperand(OpI), Part));
+    }
+
+    switch (Instr->getOpcode()) {
+    case Instruction::GetElementPtr: {
+      auto *GEP = cast<GetElementPtrInst>(Instr);
+      Value *Ptr = Ops[0];
+
+      Ops.erase(Ops.begin());
+
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, Ops)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Ops);
+      NewPart = NewGEP;
+      break;
+    }
+    default:
+      llvm_unreachable(
+          "Don't know how to scalarize this instruction for scalable vectors!");
+    }
+
+    addMetadata(NewPart, Instr);
+  }
+
+  State.set(Def, NewPart, Part);
+}
+
 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                       Value *End, Value *Step,
                                                       Instruction *DL) {
@@ -8871,10 +8960,23 @@
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
 
-  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
-                                       IsUniform, IsPredicated);
-  setRecipe(I, Recipe);
-  Plan->addVPValue(I, Recipe);
+  bool IsScalable = Range.Start.isScalable();
+  assert(IsScalable == Range.End.isScalable() &&
+         "VFRange contains mixture of scalable and fixed-width VFs!");
+  VPRecipeBase *Recipe;
+  if (IsScalable && !IsUniform) {
+    auto *R =
+        new VPScalableReplicateRecipe(I, Plan->mapToVPValues(I->operands()));
+    setRecipe(I, R);
+    Plan->addVPValue(I, R);
+    Recipe = R;
+  } else {
+    auto *R = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
+                                    IsUniform, IsPredicated);
+    setRecipe(I, R);
+    Plan->addVPValue(I, R);
+    Recipe = R;
+  }
 
   // Find if I uses a predicated instruction. If so, it will use its scalar
   // value. Avoid hoisting the insert-element which packs the scalar value into
@@ -8883,6 +8985,8 @@
     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
     if (!PredR)
       continue;
+    assert(!IsScalable &&
+           "Don't expect to replicate predicated scalable instructions");
     auto *RepR =
         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
     assert(RepR->isPredicated() &&
@@ -9577,6 +9681,13 @@
   }
 }
 
+void VPScalableReplicateRecipe::execute(VPTransformState &State) {
+  assert(State.VF.isScalable() && "Only expect scalable vectors");
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, Part,
+                                    State);
+}
+
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -755,6 +755,7 @@
            Def->getVPDefID() == VPRecipeBase::VPBlendSC ||
            Def->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
            Def->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+           Def->getVPDefID() == VPRecipeBase::VPScalableReplicateSC ||
            Def->getVPDefID() == VPRecipeBase::VPReductionSC ||
            Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC ||
            Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
@@ -1339,6 +1340,40 @@
   bool isPredicated() const { return IsPredicated; }
 };
 
+/// VPScalableReplicateRecipe replicates a given instruction, producing a whole
+/// vector value whose lanes are copies of the original scalar result, one per
+/// lane. If the instruction is known to be uniform, only one copy (for lane
+/// zero) is generated and broadcast to all lanes.
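+///
+/// For example (a sketch based on the @replicate_extractvalue test added
+/// below, assuming VF = vscale x 2; %ins and %splat are illustrative names),
+/// the loop-invariant instruction
+///   %a = extractvalue { i64, i64 } %sv, 0
+/// is emitted as a single scalar clone followed by a broadcast:
+///   %a.cloned = extractvalue { i64, i64 } %sv, 0
+///   %ins = insertelement <vscale x 2 x i64> poison, i64 %a.cloned, i32 0
+///   %splat = shufflevector <vscale x 2 x i64> %ins,
+///            <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+/// whereas a non-uniform GEP is emitted as a single vector-of-pointers GEP.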
+class VPScalableReplicateRecipe : public VPRecipeBase, public VPValue {
+public:
+  template <typename IterT>
+  VPScalableReplicateRecipe(Instruction *I, iterator_range<IterT> Operands)
+      : VPRecipeBase(VPScalableReplicateSC, Operands),
+        VPValue(VPVScalableReplicateSC, I, this) {}
+
+  ~VPScalableReplicateRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPDef *D) {
+    return D->getVPDefID() == VPRecipeBase::VPScalableReplicateSC;
+  }
+
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVScalableReplicateSC;
+  }
+
+  /// Generate replicas of the desired Ingredient. Replicas will be generated
+  /// for all parts and lanes unless a specific part and lane are specified in
+  /// the \p State.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for generating conditional branches on the bits of a mask.
 class VPBranchOnMaskRecipe : public VPRecipeBase {
 public:
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -594,6 +594,10 @@
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
   }
+  case VPScalableReplicateSC: {
+    auto *R = cast<VPScalableReplicateRecipe>(this);
+    return R->getUnderlyingInstr()->mayHaveSideEffects();
+  }
   default:
     return true;
   }
@@ -1165,6 +1169,18 @@
   O << " (S->V)";
 }
 
+void VPScalableReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
+                                      VPSlotTracker &SlotTracker) const {
+  O << Indent << "SCALABLE REPLICATE ";
+
+  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
+    printAsOperand(O, SlotTracker);
+    O << " = ";
+  }
+  O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
+  printOperands(O, SlotTracker);
+}
+
 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
Index: llvm/lib/Transforms/Vectorize/VPlanValue.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -96,6 +96,7 @@
     VPVPredInstPHI,
     VPVReductionSC,
     VPVReplicateSC,
+    VPVScalableReplicateSC,
     VPVWidenSC,
     VPVWidenCallSC,
     VPVWidenGEPSC,
@@ -321,6 +322,7 @@
     VPPredInstPHISC,
     VPReductionSC,
     VPReplicateSC,
+    VPScalableReplicateSC,
     VPWidenCallSC,
     VPWidenCanonicalIVSC,
     VPWidenGEPSC,
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-vpreplicate.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-vpreplicate.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -S | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; In the test below the PHI instruction:
+;   %0 = phi i8* [ %incdec.ptr190, %while.body189 ], [ %src, %entry ]
+; has multiple uses, i.e.
+;   1. As a uniform address for the load, and
+;   2. Non-uniform use by the getelementptr + store, which leads to
+;      replication.
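+;
+; A sketch of the replication this exercises (assuming VF = vscale x 2;
+; %widened.phi is an illustrative name, not a value produced by this test):
+; VPScalableReplicateRecipe rewrites the scalar GEP into a single
+; vector-of-pointers GEP on the widened PHI:
+;   %incdec.ptr190 = getelementptr inbounds i8, <vscale x 2 x i8*> %widened.phi, i64 1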
+
+define void @phi_multiple_use(i8** noalias %curptr, i8* noalias %src, i8* noalias %cond.i, i64 %N) #0 {
+; CHECK-LABEL: @phi_multiple_use(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-NEXT: {{.*}} = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** %curptr, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP3]]
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* %src, <vscale x 2 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP6]], i64 1
+; CHECK: store <vscale x 2 x i8*> [[TMP5]], <vscale x 2 x i8*>*
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 2 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]],
+; CHECK: store <vscale x 2 x i8> [[TMP9]], <vscale x 2 x i8>*
+
+entry:
+  br label %while.body189
+
+while.body189:                                    ; preds = %while.body189, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %while.body189 ]
+  %curchar = phi i8** [ %curchar.next, %while.body189 ], [ %curptr, %entry ]
+  %0 = phi i8* [ %incdec.ptr190, %while.body189 ], [ %src, %entry ]
+  %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+  %curchar.next = getelementptr inbounds i8*, i8** %curchar, i64 1
+  store i8* %incdec.ptr190, i8** %curchar, align 8
+  %1 = load i8, i8* %0, align 1
+  %2 = add i8 %1, 1
+  store i8 %2, i8* %0, align 1
+  %index.next = add nuw i64 %index, 1
+  %3 = icmp ne i64 %index.next, %N
+  br i1 %3, label %while.body189, label %while.end192.loopexit, !llvm.loop !0
+
+while.end192.loopexit:                            ; preds = %while.body189
+  ret void
+}
+
+define void @replicate_noalias_decl(i8** noalias %curptr, i8* noalias %src, i8* noalias %cond.i, i64 %N) #0 {
+; CHECK-LABEL: @replicate_noalias_decl(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-NEXT: {{.*}} = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** %curptr, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* %src, i64 [[TMP2]]
+; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: tail call void @llvm.experimental.noalias.scope.decl
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 2 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]],
+; CHECK: store <vscale x 2 x i8> [[TMP6]], <vscale x 2 x i8>*
+
+entry:
+  br label %while.body189
+
+while.body189:                                    ; preds = %while.body189, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %while.body189 ]
+  %curchar = phi i8** [ %curchar.next, %while.body189 ], [ %curptr, %entry ]
+  %0 = phi i8* [ %incdec.ptr190, %while.body189 ], [ %src, %entry ]
+  %curchar.next = getelementptr inbounds i8*, i8** %curchar, i64 1
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !4)
+  %1 = load i8, i8* %0, align 1
+  %2 = add i8 %1, 1
+  store i8 %2, i8* %0, align 1
+  %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+  %index.next = add nuw i64 %index, 1
+  %3 = icmp ne i64 %index.next, %N
+  br i1 %3, label %while.body189, label %while.end192.loopexit, !llvm.loop !0
+
+while.end192.loopexit:                            ; preds = %while.body189
+  ret void
+}
+
+define void @replicate_extractvalue(i64* %dst, {i64, i64} %sv) #0 {
+; CHECK-LABEL: @replicate_extractvalue(
+; CHECK: vector.body:                             ; preds = %vector.body, %vector.ph
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX1]], 0
+; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { i64, i64 } %sv, 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT1]], i32 0
+; CHECK-NEXT: [[SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { i64, i64 } %sv, 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT2]], i32 0
+; CHECK-NEXT: [[SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, i64* %dst, i32 [[TMP1]]
+; CHECK-NEXT: [[STOREVAL:%.*]] = add <vscale x 2 x i64> [[SPLAT1]], [[SPLAT2]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i64, i64* [[GEP1]], i32 0
+; CHECK-NEXT: [[STOREPTR:%.*]] = bitcast i64* [[GEP2]] to <vscale x 2 x i64>*
+; CHECK-NEXT: store <vscale x 2 x i64> [[STOREVAL]], <vscale x 2 x i64>* [[STOREPTR]], align 4
+
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { i64, i64 } %sv, 0
+  %b = extractvalue { i64, i64 } %sv, 1
+  %addr = getelementptr i64, i64* %dst, i32 %iv
+  %add = add i64 %a, %b
+  store i64 %add, i64* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{ !5 }
+!5 = distinct !{ !5, !6 }
+!6 = distinct !{ !7 }
+!7 = distinct !{ !7, !6 }
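
A minimal sketch of the `Part != 0` early return in scalarizeInstruction(), assuming the loop from @replicate_noalias_decl were vectorized with UF = 2 (the unroll factor here is illustrative; the test above forces an interleave count of 1): the llvm.experimental.noalias.scope.decl intrinsic is cloned for part 0 only, so the vector body still contains a single call:

  vector.body:
    ; Cloned once by scalarizeInstruction() for Part 0; the early return
    ; suppresses the clone for Part 1.
    tail call void @llvm.experimental.noalias.scope.decl(metadata !4)
    ; ... the remaining loads, adds and stores are emitted once per part ...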