Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1137,6 +1137,12 @@
   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
 }

+/// Return the runtime value for VF.
+Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
 namespace llvm {

 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2454,7 +2460,7 @@
   if (OrigLoop->isLoopInvariant(V))
     return V;

-  assert(Instance.Lane > 0
+  assert((Instance.Lane > 0 || !Instance.isKnownLane())
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

@@ -2474,10 +2480,21 @@
     return U;
   }

+  Value *Lane;
+  switch (Instance.Kind) {
+  case VPIteration::LaneKind::LK_First:
+    Lane = Builder.getInt32(Instance.Lane);
+    break;
+  case VPIteration::LaneKind::LK_ScalableLast:
+    Lane = Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                             Builder.getInt32(1 + Instance.Lane));
+    break;
+  }
+
   // Otherwise, the value from the original loop has been vectorized and is
   // represented by UF vector values. Extract and return the requested scalar
   // value from the appropriate vector lane.
-  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+  return Builder.CreateExtractElement(U, Lane);
 }

 void InnerLoopVectorizer::packScalarIntoVectorValue(
@@ -2488,6 +2505,7 @@
   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+  assert(Instance.isKnownLane() && "Only constant lane indices supported");
   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                             Builder.getInt32(Instance.Lane));
   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
@@ -2897,8 +2915,10 @@
     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
-        (Cost->isUniformAfterVectorization(Operand, State.VF)))
+        (Cost->isUniformAfterVectorization(Operand, State.VF))) {
       InputInstance.Lane = 0;
+      InputInstance.Kind = VPIteration::LaneKind::LK_First;
+    }
     auto *NewOp = State.get(User.getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
@@ -4395,19 +4415,24 @@
     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
     // Non-instruction incoming values will have only one value.
-    unsigned LastLane = 0;
-    if (isa<Instruction>(IncomingValue))
-      LastLane = Cost->isUniformAfterVectorization(
-                     cast<Instruction>(IncomingValue), VF)
-                     ? 0
-                     : VF.getKnownMinValue() - 1;
-    assert((!VF.isScalable() || LastLane == 0) &&
-           "scalable vectors dont support non-uniform scalars yet");
+
+    unsigned Lane = 0;
+    VPIteration::LaneKind Kind = VPIteration::LaneKind::LK_First;
+    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+    if (isa<Instruction>(IncomingValue) &&
+        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+                                           VF)) {
+      if (VF.isScalable())
+        // In this case 'Lane' refers to the lane offset from the last lane
+        Kind = VPIteration::LaneKind::LK_ScalableLast;
+      else
+        Lane = VF.getKnownMinValue() - 1;
+    }
+
     // Can be a loop invariant incoming value or the last scalar value to be
     // extracted from the vectorized loop.
-    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
     Value *lastIncomingValue =
-        getOrCreateScalarValue(IncomingValue, VPIteration(UF - 1, LastLane));
+        getOrCreateScalarValue(IncomingValue, VPIteration(UF - 1, Lane, Kind));
     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
   }
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -98,9 +98,43 @@
   /// in [0..VF)
   unsigned Lane;

-  VPIteration(unsigned Part, unsigned Lane) : Part(Part), Lane(Lane) {}
+  /// LaneKind describes how to interpret Lane.
+  /// For LK_First, Lane is the index into the first N elements of a
+  /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
+  /// For LK_ScalableLast, Lane is the offset from the last lane of a scalable
+  /// vector <vscale x N x <ElTy>>. For example, a Lane of 0 corresponds to
+  /// the last lane, a Lane of 1 means last lane - 1, etc.
+  enum class LaneKind { LK_First, LK_ScalableLast };

-  bool isFirstIteration() const { return Part == 0 && Lane == 0; }
+  /// Indicates how the Lane should be interpreted, as described above.
+  LaneKind Kind;
+
+  VPIteration(unsigned Part, unsigned Lane, LaneKind Kind = LaneKind::LK_First)
+      : Part(Part), Lane(Lane), Kind(Kind) {}
+
+  bool isFirstIteration() const {
+    return Part == 0 && Lane == 0 && Kind == LaneKind::LK_First;
+  }
+
+  bool isKnownLane() const { return Kind == LaneKind::LK_First; }
+
+  /// Returns the maximum number of lanes that we are able to consider
+  /// caching for \p VF.
+  static unsigned getNumCachedLanes(const ElementCount &VF) {
+    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
+  }
+
+  /// Maps the Lane to a cache index based on \p VF.
+  unsigned mapLaneToIndex(const ElementCount &VF) const {
+    switch (Kind) {
+    case LaneKind::LK_ScalableLast:
+      assert(VF.isScalable());
+      return VF.getKnownMinValue() + Lane;
+    default:
+      assert(Kind == LaneKind::LK_First);
+      return Lane;
+    }
+  }
 };

 /// This is a helper struct for maintaining vectorization state. It's used for
@@ -172,9 +206,10 @@
       return false;
     const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
     assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
-    assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
+    assert(Entry[Instance.Part].size() == VPIteration::getNumCachedLanes(VF) &&
           "ScalarParts has wrong dimensions.");
-    return Entry[Instance.Part][Instance.Lane] != nullptr;
+    unsigned CacheIdx = Instance.mapLaneToIndex(VF);
+    return Entry[Instance.Part][CacheIdx] != nullptr;
   }

   /// Retrieve the existing vector value that corresponds to \p Key and
@@ -188,7 +223,8 @@
   /// \p Instance.
   Value *getScalarValue(Value *Key, const VPIteration &Instance) {
     assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
-    return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+    unsigned CacheIdx = Instance.mapLaneToIndex(VF);
+    return ScalarMapStorage[Key][Instance.Part][CacheIdx];
   }

   /// Set a vector value associated with \p Key and \p Part. Assumes such a
@@ -211,10 +247,11 @@
       // TODO: Consider storing uniform values only per-part, as they occupy
       // lane 0 only, keeping the other VF-1 redundant entries null.
       for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part].resize(VF.getKnownMinValue(), nullptr);
+        Entry[Part].resize(VPIteration::getNumCachedLanes(VF), nullptr);
       ScalarMapStorage[Key] = Entry;
     }
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = Instance.mapLaneToIndex(VF);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }

   /// Reset the vector value associated with \p Key for the given \p Part.
@@ -234,7 +271,8 @@
                       Value *Scalar) {
     assert(hasScalarValue(Key, Instance) &&
           "Scalar value not set for part and lane");
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = Instance.mapLaneToIndex(VF);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 };

@@ -304,6 +342,7 @@
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
       return false;
+    assert(Instance.isKnownLane() && "Non-constant lanes unsupported");
     return Instance.Part < I->second.size() &&
           Instance.Lane < I->second[Instance.Part].size() &&
           I->second[Instance.Part][Instance.Lane];
@@ -325,6 +364,7 @@
     while (PerPartVec.size() <= Instance.Part)
       PerPartVec.emplace_back();
     auto &Scalars = PerPartVec[Instance.Part];
+    assert(Instance.isKnownLane() && "Non-constant lanes unsupported");
     while (Scalars.size() <= Instance.Lane)
       Scalars.push_back(nullptr);
     Scalars[Instance.Lane] = V;
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -47,6 +47,7 @@
 using namespace llvm;

 extern cl::opt<bool> EnableVPlanNativePath;
+extern Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);

 #define DEBUG_TYPE "vplan"

@@ -227,12 +228,24 @@
     assert(Data.PerPartOutput.count(Def));
     auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
     if (!VecPart->getType()->isVectorTy()) {
-      assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar");
+      assert(Instance.Lane == 0 &&
+             Instance.Kind == VPIteration::LaneKind::LK_First &&
+             "cannot get lane > 0 for scalar");
       return VecPart;
     }
+
+    Value *Lane;
+    switch (Instance.Kind) {
+    case VPIteration::LaneKind::LK_First:
+      Lane = Builder.getInt32(Instance.Lane);
+      break;
+    case VPIteration::LaneKind::LK_ScalableLast:
+      Lane = Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                               Builder.getInt32(1 + Instance.Lane));
+      break;
+    }
     // TODO: Cache created scalar values.
-    return Builder.CreateExtractElement(VecPart,
-                                        Builder.getInt32(Instance.Lane));
+    return Builder.CreateExtractElement(VecPart, Lane);
   }
   return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
 }
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -0,0 +1,77 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                             ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul, i32* %arrayidx5, align 4
+  ret void
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body, %entry
+  ret float %mul
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
Index: llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
@@ -0,0 +1,61 @@
+; RUN: opt -loop-vectorize -dce -instcombine -S < %s 2>%t | FileCheck %s
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                             ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul, i32* %arrayidx5, align 4
+  ret void
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body, %entry
+  ret float %mul
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}