Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1103,6 +1103,12 @@ return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } +/// Return the runtime value for VF. +Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { + Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(EC) : EC; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -2357,7 +2363,8 @@ Value *ScalarInst = State.get(Def, Instance); Value *VectorValue = State.get(Def, Instance.Part); VectorValue = Builder.CreateInsertElement( - VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane)); + VectorValue, ScalarInst, + Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); State.set(Def, VectorValue, Instance.Part); } @@ -2769,7 +2776,7 @@ auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) - InputInstance.Lane = 0; + InputInstance.Lane = VPLane::getFirstLane(); auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } @@ -4285,22 +4292,21 @@ auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast(IncomingValue), VF) - ? 0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "scalable vectors dont support non-uniform scalars yet"); + + VPLane Lane = VPLane::getFirstLane(); + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + if (isa(IncomingValue) && + !Cost->isUniformAfterVectorization(cast(IncomingValue), + VF)) + Lane = VPLane::getLastLaneForVF(VF); + // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Value *lastIncomingValue = OrigLoop->isLoopInvariant(IncomingValue) ? IncomingValue : State.get(State.Plan->getVPValue(IncomingValue), - VPIteration(UF - 1, LastLane)); + VPIteration(UF - 1, Lane)); LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -8981,7 +8987,7 @@ // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. - if (State.Instance->Lane == 0) { + if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Poison = PoisonValue::get( VectorType::get(getUnderlyingValue()->getType(), State.VF)); @@ -9009,7 +9015,7 @@ assert(State.Instance && "Branch on Mask works only on single instance."); unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane; + unsigned Lane = State.Instance->Lane.getKnownLane(); Value *ConditionBit = nullptr; VPValue *BlockInMask = getMask(); Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -89,18 +89,104 @@ /// vectorizer whereas the term "output IR" refers to code that is generated by /// the vectorizer. +/// VPLane provides a way to access lanes in both fixed width and scalable +/// vectors, where for the latter the lane index sometimes needs calculating +/// as a runtime expression. +class VPLane { +public: + /// Kind describes how to interpret Lane. + enum class Kind : uint8_t { + /// For First, Lane is the index into the first N elements of a + /// fixed-vector > or a scalable vector >. + First, + /// For ScalableLast, Lane is the offset from the start of the last + /// N-element subvector in a scalable vector >. For + /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of + /// 1 corresponds to `((vscale - 1) * N) + 1`, etc. + ScalableLast + }; + +private: + /// in [0..VF) + unsigned Lane; + + /// Indicates how the Lane should be interpreted, as described above. + Kind LaneKind; + +public: + VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} + + static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } + + static VPLane getLastLaneForVF(const ElementCount &VF) { + unsigned LaneOffset = VF.getKnownMinValue() - 1; + Kind LaneKind; + if (VF.isScalable()) + // In this case 'LaneOffset' refers to the offset from the start of the + // last subvector with VF.getKnownMinValue() elements. + LaneKind = VPLane::Kind::ScalableLast; + else + LaneKind = VPLane::Kind::First; + return VPLane(LaneOffset, LaneKind); + } + + /// Returns a compile-time known value for the lane index and asserts if the + /// lane can only be calculated at runtime. + unsigned getKnownLane() const { + assert(LaneKind == Kind::First); + return Lane; + } + + /// Returns an expression describing the lane index that can be used at + /// runtime. + Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const; + + /// Returns the Kind of lane offset. + Kind getKind() const { return LaneKind; } + + /// Sets the lane offset and lane kind. + void set(unsigned L, Kind K) { + Lane = L; + LaneKind = K; + } + + /// Returns true if this is the first lane of the whole vector. + bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; } + + /// Maps the lane to a cache index based on \p VF. + unsigned mapToCacheIndex(const ElementCount &VF) const { + switch (LaneKind) { + case VPLane::Kind::ScalableLast: + assert(VF.isScalable() && Lane < VF.getKnownMinValue()); + return VF.getKnownMinValue() + Lane; + default: + assert(Lane < VF.getKnownMinValue()); + return Lane; + } + } + + /// Returns the maxmimum number of lanes that we are able to consider + /// caching for \p VF. + static unsigned getNumCachedLanes(const ElementCount &VF) { + return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1); + } +}; + /// VPIteration represents a single point in the iteration space of the output /// (vectorized and/or unrolled) IR loop. struct VPIteration { /// in [0..UF) unsigned Part; - /// in [0..VF) - unsigned Lane; + VPLane Lane; + + VPIteration(unsigned Part, unsigned Lane, + VPLane::Kind Kind = VPLane::Kind::First) + : Part(Part), Lane(Lane, Kind) {} - VPIteration(unsigned Part, unsigned Lane) : Part(Part), Lane(Lane) {} + VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {} - bool isFirstIteration() const { return Part == 0 && Lane == 0; } + bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); } }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -157,9 +243,10 @@ auto I = Data.PerPartScalars.find(Def); if (I == Data.PerPartScalars.end()) return false; + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); return Instance.Part < I->second.size() && - Instance.Lane < I->second[Instance.Part].size() && - I->second[Instance.Part][Instance.Lane]; + CacheIdx < I->second[Instance.Part].size() && + I->second[Instance.Part][CacheIdx]; } /// Set the generated Value for a given VPValue and a given Part. @@ -185,10 +272,11 @@ while (PerPartVec.size() <= Instance.Part) PerPartVec.emplace_back(); auto &Scalars = PerPartVec[Instance.Part]; - while (Scalars.size() <= Instance.Lane) + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + while (Scalars.size() <= CacheIdx) Scalars.push_back(nullptr); - assert(!Scalars[Instance.Lane] && "should overwrite existing value"); - Scalars[Instance.Lane] = V; + assert(!Scalars[CacheIdx] && "should overwrite existing value"); + Scalars[CacheIdx] = V; } /// Reset an existing scalar value for \p Def and a given \p Instance. @@ -198,9 +286,10 @@ "need to overwrite existing value"); assert(Instance.Part < Iter->second.size() && "need to overwrite existing value"); - assert(Instance.Lane < Iter->second[Instance.Part].size() && + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + assert(CacheIdx < Iter->second[Instance.Part].size() && "need to overwrite existing value"); - Iter->second[Instance.Part][Instance.Lane] = V; + Iter->second[Instance.Part][CacheIdx] = V; } /// Hold state information used when constructing the CFG of the output IR, Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -47,6 +47,7 @@ using namespace llvm; extern cl::opt EnableVPlanNativePath; +extern Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); #define DEBUG_TYPE "vplan" @@ -58,6 +59,19 @@ return OS; } +Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder, + const ElementCount &VF) const { + switch (LaneKind) { + case VPLane::Kind::ScalableLast: + // Lane = RuntimeVF - VF.getKnownMinValue() + Lane + return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF), + Builder.getInt32(VF.getKnownMinValue() - Lane)); + case VPLane::Kind::First: + return Builder.getInt32(Lane); + } + llvm_unreachable("Unknown lane kind"); +} + VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) : SubclassID(SC), UnderlyingVal(UV), Def(Def) { if (Def) @@ -244,18 +258,20 @@ if (!Def->getDef()) return Def->getLiveInIRValue(); - if (hasScalarValue(Def, Instance)) - return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; + if (hasScalarValue(Def, Instance)) { + return Data + .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)]; + } assert(hasVectorValue(Def, Instance.Part)); auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; if (!VecPart->getType()->isVectorTy()) { - assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); + assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar"); return VecPart; } // TODO: Cache created scalar values. - auto *Extract = - Builder.CreateExtractElement(VecPart, Builder.getInt32(Instance.Lane)); + Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF); + auto *Extract = Builder.CreateExtractElement(VecPart, Lane); // set(Def, Extract, Instance); return Extract; } @@ -427,7 +443,7 @@ assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; ++Lane) { - State->Instance->Lane = Lane; + State->Instance->Lane.set(Lane, VPLane::Kind::First); // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll @@ -0,0 +1,77 @@ +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s + +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 { +; CHECK-LABEL: @inv_store_last_lane +; CHECK: vector.body: +; CHECK: store %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] + +entry: + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42 + store i32 %mul, i32* %arrayidx5, align 4 + ret void +} + +define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 { +; CHECK-LABEL: @ret_last_lane +; CHECK: vector.body: +; CHECK: store %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] + +entry: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %mul = fmul float %0, 2.000000e+00 + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6 + +exit: ; preds = %for.body, %entry + ret float %mul +} + +attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" } + +!0 = distinct !{!0, !1, !2, !3, !4, !5} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.interleave.count", i32 1} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = distinct !{!6, !1, !2, !3, !4, !5} Index: llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll @@ -0,0 +1,61 @@ +; RUN: opt -loop-vectorize -dce -instcombine -S < %s 2>%t | FileCheck %s + +define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) { +; CHECK-LABEL: @inv_store_last_lane +; CHECK: vector.body: +; CHECK: store <4 x i32> %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3 + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42 + store i32 %mul, i32* %arrayidx5, align 4 + ret void +} + +define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) { +; CHECK-LABEL: @ret_last_lane +; CHECK: vector.body: +; CHECK: store <4 x float> %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3 + +entry: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %mul = fmul float %0, 2.000000e+00 + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6 + +exit: ; preds = %for.body, %entry + ret float %mul +} + +!0 = distinct !{!0, !1, !2, !3, !4, !5} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false} +!4 = !{!"llvm.loop.interleave.count", i32 1} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = distinct !{!6, !1, !2, !3, !4, !5}