Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1151,6 +1151,12 @@ return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } +/// Return the runtime value for VF. +Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { + Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(EC) : EC; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -2477,7 +2483,7 @@ if (OrigLoop->isLoopInvariant(V)) return V; - assert(Instance.Lane > 0 + assert(!Instance.Lane.isFirstLane() ? !Cost->isUniformAfterVectorization(cast(V), VF) : true && "Uniform values only have lane zero"); @@ -2497,10 +2503,12 @@ return U; } + Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF); + // Otherwise, the value from the original loop has been vectorized and is // represented by UF vector values. Extract and return the requested scalar // value from the appropriate vector lane. 
- return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); + return Builder.CreateExtractElement(U, Lane); } void InnerLoopVectorizer::packScalarIntoVectorValue( @@ -2511,8 +2519,8 @@ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); - VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, - Builder.getInt32(Instance.Lane)); + VectorValue = Builder.CreateInsertElement( + VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF)); VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); } @@ -2522,7 +2530,8 @@ Value *ScalarInst = State.get(Def, Instance); Value *VectorValue = State.get(Def, Instance.Part); VectorValue = Builder.CreateInsertElement( - VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane)); + VectorValue, ScalarInst, + Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); State.set(Def, VectorValue, Instance.Part); } @@ -2931,7 +2940,7 @@ auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) - InputInstance.Lane = 0; + InputInstance.Lane.set(0, VPLane::Kind::First); auto *NewOp = State.get(User.getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } @@ -4431,22 +4440,21 @@ auto *IncomingValue = LCSSAPhi.getIncomingValue(0); // Non-instruction incoming values will have only one value. - unsigned LastLane = 0; - if (isa(IncomingValue)) - LastLane = Cost->isUniformAfterVectorization( - cast(IncomingValue), VF) - ? 
0 - : VF.getKnownMinValue() - 1; - assert((!VF.isScalable() || LastLane == 0) && - "scalable vectors dont support non-uniform scalars yet"); + + VPLane Lane(0, VPLane::Kind::First); + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + if (isa(IncomingValue) && + !Cost->isUniformAfterVectorization(cast(IncomingValue), + VF)) + Lane = VPLane::getLastLaneForVF(VF); + // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Value *lastIncomingValue = OrigLoop->isLoopInvariant(IncomingValue) ? IncomingValue : State.get(State.Plan->getVPValue(IncomingValue), - VPIteration(UF - 1, LastLane)); + VPIteration(UF - 1, Lane)); LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); } } @@ -9127,7 +9135,7 @@ // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. - if (State.Instance->Lane == 0) { + if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Poison = PoisonValue::get( VectorType::get(getUnderlyingValue()->getType(), State.VF)); @@ -9157,7 +9165,7 @@ assert(State.Instance && "Branch on Mask works only on single instance."); unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane; + unsigned Lane = State.Instance->Lane.getKnownLane(); Value *ConditionBit = nullptr; VPValue *BlockInMask = getMask(); Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -89,18 +89,102 @@ /// vectorizer whereas the term "output IR" refers to code that is generated by /// the vectorizer. 
+/// VPLane provides a way to access lanes in both fixed width and scalable
+/// vectors, where for the latter the lane index sometimes needs calculating
+/// as a runtime expression.
+class VPLane {
+public:
+  /// Kind describes how to interpret Lane.
+  enum class Kind : uint8_t {
+    /// For First, Lane is the index into the first N elements of a
+    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
+    First,
+    /// For ScalableLast, Lane is the offset from the start of the last
+    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
+    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
+    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
+    ScalableLast
+  };
+
+private:
+  /// in [0..VF)
+  unsigned Lane;
+
+  /// Indicates how the Lane should be interpreted, as described above.
+  Kind LaneKind;
+
+public:
+  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
+
+  static VPLane getLastLaneForVF(const ElementCount &VF) {
+    unsigned LaneOffset = VF.getKnownMinValue() - 1;
+    Kind LaneKind;
+    if (VF.isScalable())
+      // In this case 'LaneOffset' refers to the offset from the start of the
+      // last subvector with VF.getKnownMinValue() elements.
+      LaneKind = VPLane::Kind::ScalableLast;
+    else
+      LaneKind = VPLane::Kind::First;
+    return VPLane(LaneOffset, LaneKind);
+  }
+
+  /// Returns a compile-time known value for the lane index and asserts if the
+  /// lane can only be calculated at runtime.
+  unsigned getKnownLane() const {
+    assert(LaneKind == Kind::First);
+    return Lane;
+  }
+
+  /// Returns an expression describing the lane index that can be used at
+  /// runtime.
+  Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const;
+
+  /// Returns the Kind of lane offset.
+  Kind getKind() const { return LaneKind; }
+
+  /// Sets the lane offset and lane kind.
+  void set(unsigned L, Kind K) {
+    Lane = L;
+    LaneKind = K;
+  }
+
+  /// Returns true if this is the first lane of the whole vector.
+  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
+
+  /// Maps the lane to a cache index based on \p VF.
+  unsigned mapToCacheIndex(const ElementCount &VF) const {
+    switch (LaneKind) {
+    case VPLane::Kind::ScalableLast:
+      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
+      return VF.getKnownMinValue() + Lane;
+    default:
+      assert(Lane < VF.getKnownMinValue());
+      return Lane;
+    }
+  }
+
+  /// Returns the maximum number of lanes that we are able to consider
+  /// caching for \p VF.
+  static unsigned getNumCachedLanes(const ElementCount &VF) {
+    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
+  }
+};
+
 /// VPIteration represents a single point in the iteration space of the output
 /// (vectorized and/or unrolled) IR loop.
 struct VPIteration {
   /// in [0..UF)
   unsigned Part;
 
-  /// in [0..VF)
-  unsigned Lane;
+  VPLane Lane;
+
+  VPIteration(unsigned Part, unsigned Lane,
+              VPLane::Kind Kind = VPLane::Kind::First)
+      : Part(Part), Lane(Lane, Kind) {}
 
-  VPIteration(unsigned Part, unsigned Lane) : Part(Part), Lane(Lane) {}
+  VPIteration(unsigned Part, VPLane Lane) : Part(Part), Lane(Lane) {}
 
-  bool isFirstIteration() const { return Part == 0 && Lane == 0; }
+  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
 };
 
 /// This is a helper struct for maintaining vectorization state. It's used for
@@ -165,16 +249,15 @@
   /// \return True if the map has a scalar entry for \p Key and \p Instance.
bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.getKnownMinValue() && - "Queried Scalar Lane is too large."); if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && + assert(Entry[Instance.Part].size() == VPLane::getNumCachedLanes(VF) && "ScalarParts has wrong dimensions."); - return Entry[Instance.Part][Instance.Lane] != nullptr; + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + return Entry[Instance.Part][CacheIdx] != nullptr; } /// Retrieve the existing vector value that corresponds to \p Key and @@ -188,7 +271,8 @@ /// \p Instance. Value *getScalarValue(Value *Key, const VPIteration &Instance) { assert(hasScalarValue(Key, Instance) && "Getting non-existent value."); - return ScalarMapStorage[Key][Instance.Part][Instance.Lane]; + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + return ScalarMapStorage[Key][Instance.Part][CacheIdx]; } /// Set a vector value associated with \p Key and \p Part. Assumes such a @@ -211,10 +295,11 @@ // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.getKnownMinValue(), nullptr); + Entry[Part].resize(VPLane::getNumCachedLanes(VF), nullptr); ScalarMapStorage[Key] = Entry; } - ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar; } /// Reset the vector value associated with \p Key for the given \p Part. 
@@ -234,7 +319,8 @@ Value *Scalar) { assert(hasScalarValue(Key, Instance) && "Scalar value not set for part and lane"); - ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar; } }; @@ -299,8 +385,8 @@ if (I == Data.PerPartScalars.end()) return false; return Instance.Part < I->second.size() && - Instance.Lane < I->second[Instance.Part].size() && - I->second[Instance.Part][Instance.Lane]; + Instance.Lane.getKnownLane() < I->second[Instance.Part].size() && + I->second[Instance.Part][Instance.Lane.getKnownLane()]; } /// Set the generated Value for a given VPValue and a given Part. @@ -321,9 +407,9 @@ while (PerPartVec.size() <= Instance.Part) PerPartVec.emplace_back(); auto &Scalars = PerPartVec[Instance.Part]; - while (Scalars.size() <= Instance.Lane) + while (Scalars.size() <= Instance.Lane.getKnownLane()) Scalars.push_back(nullptr); - Scalars[Instance.Lane] = V; + Scalars[Instance.Lane.getKnownLane()] = V; } /// Hold state information used when constructing the CFG of the output IR, Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -47,6 +47,7 @@ using namespace llvm; extern cl::opt EnableVPlanNativePath; +extern Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); #define DEBUG_TYPE "vplan" @@ -58,6 +59,19 @@ return OS; } +Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder, + const ElementCount &VF) const { + switch (LaneKind) { + case VPLane::Kind::ScalableLast: + // Lane = RuntimeVF - VF.getKnownMinValue() + Lane + return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF), + Builder.getInt32(VF.getKnownMinValue() - Lane)); + case VPLane::Kind::First: + return Builder.getInt32(Lane); + } + llvm_unreachable("Unknown lane kind"); +} + VPValue::VPValue(const 
unsigned char SC, Value *UV, VPDef *Def) : SubclassID(SC), UnderlyingVal(UV), Def(Def) { if (Def) @@ -221,18 +235,20 @@ return Def->getLiveInIRValue(); if (hasScalarValue(Def, Instance)) - return Data.PerPartScalars[Def][Instance.Part][Instance.Lane]; + return Data + .PerPartScalars[Def][Instance.Part][Instance.Lane.getKnownLane()]; if (hasVectorValue(Def, Instance.Part)) { assert(Data.PerPartOutput.count(Def)); auto *VecPart = Data.PerPartOutput[Def][Instance.Part]; if (!VecPart->getType()->isVectorTy()) { - assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar"); + assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar"); return VecPart; } + + Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF); // TODO: Cache created scalar values. - return Builder.CreateExtractElement(VecPart, - Builder.getInt32(Instance.Lane)); + return Builder.CreateExtractElement(VecPart, Lane); } return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); } @@ -407,7 +423,7 @@ assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; ++Lane) { - State->Instance->Lane = Lane; + State->Instance->Lane.set(Lane, VPLane::Kind::First); // Visit the VPBlocks connected to \p this, starting from it. for (VPBlockBase *Block : RPOT) { LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll @@ -0,0 +1,77 @@ +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s + +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 { +; CHECK-LABEL: @inv_store_last_lane +; CHECK: vector.body: +; CHECK: store %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] + +entry: + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42 + store i32 %mul, i32* %arrayidx5, align 4 + ret void +} + +define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 { +; CHECK-LABEL: @ret_last_lane +; CHECK: vector.body: +; CHECK: store %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] + +entry: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = 
getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %mul = fmul float %0, 2.000000e+00 + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6 + +exit: ; preds = %for.body, %entry + ret float %mul +} + +attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" } + +!0 = distinct !{!0, !1, !2, !3, !4, !5} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.interleave.count", i32 1} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = distinct !{!6, !1, !2, !3, !4, !5} Index: llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll @@ -0,0 +1,61 @@ +; RUN: opt -loop-vectorize -dce -instcombine -S < %s 2>%t | FileCheck %s + +define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) { +; CHECK-LABEL: @inv_store_last_lane +; CHECK: vector.body: +; CHECK: store <4 x i32> %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3 + +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 
%n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: ; preds = %for.body + %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42 + store i32 %mul, i32* %arrayidx5, align 4 + ret void +} + +define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) { +; CHECK-LABEL: @ret_last_lane +; CHECK: vector.body: +; CHECK: store <4 x float> %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3 + +entry: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %mul = fmul float %0, 2.000000e+00 + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6 + +exit: ; preds = %for.body, %entry + ret float %mul +} + +!0 = distinct !{!0, !1, !2, !3, !4, !5} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false} +!4 = !{!"llvm.loop.interleave.count", i32 1} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = distinct !{!6, !1, !2, !3, !4, !5}