Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,12 @@
   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
 }
 
+/// Return the runtime value for VF.
+static Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2458,7 +2464,7 @@
   if (OrigLoop->isLoopInvariant(V))
     return V;
 
-  assert(Instance.Lane > 0
+  assert((Instance.Lane > 0 || Instance.isNonConstLane())
              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
              : true && "Uniform values only have lane zero");
 
@@ -2478,10 +2484,19 @@
     return U;
   }
 
+  Value *Lane;
+  if (Instance.isNonConstLane()) {
+    assert(Instance.Kind == VPIteration::LK_ScalableLast &&
+           "Cannot handle other non-constant lane types");
+    Lane = Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                             Builder.getInt32(1 + Instance.Lane));
+  } else
+    Lane = Builder.getInt32(Instance.Lane);
+
   // Otherwise, the value from the original loop has been vectorized and is
   // represented by UF vector values. Extract and return the requested scalar
   // value from the appropriate vector lane.
-  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+  return Builder.CreateExtractElement(U, Lane);
 }
 
 void InnerLoopVectorizer::packScalarIntoVectorValue(
@@ -2492,6 +2507,7 @@
 
   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+  assert(!Instance.isNonConstLane() && "Only constant lane indices supported");
   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                             Builder.getInt32(Instance.Lane));
   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
@@ -2883,7 +2899,8 @@
   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
   // the first lane and part.
   if (isa<NoAliasScopeDeclInst>(Instr))
-    if (Instance.Lane != 0 || Instance.Part != 0)
+    if (Instance.Lane != 0 || Instance.Part != 0 ||
+        Instance.Kind != VPIteration::LK_First)
       return;
 
   setDebugLocFromInst(Builder, Instr);
@@ -2901,8 +2918,10 @@
     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
-        (Cost->isUniformAfterVectorization(Operand, State.VF)))
+        (Cost->isUniformAfterVectorization(Operand, State.VF))) {
       InputInstance.Lane = 0;
+      InputInstance.Kind = VPIteration::LK_First;
+    }
     auto *NewOp = State.get(User.getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
@@ -4395,19 +4414,24 @@
     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
     // Non-instruction incoming values will have only one value.
-    unsigned LastLane = 0;
-    if (isa<Instruction>(IncomingValue))
-      LastLane = Cost->isUniformAfterVectorization(
-                     cast<Instruction>(IncomingValue), VF)
-                     ? 0
-                     : VF.getKnownMinValue() - 1;
-    assert((!VF.isScalable() || LastLane == 0) &&
-           "scalable vectors dont support non-uniform scalars yet");
+
+    unsigned Lane = 0;
+    VPIteration::LaneKind Kind = VPIteration::LK_First;
+    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+    if (isa<Instruction>(IncomingValue) &&
+        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+                                           VF)) {
+      if (VF.isScalable())
+        // In this case 'Lane' refers to the lane offset from the last lane.
+        Kind = VPIteration::LK_ScalableLast;
+      else
+        Lane = VF.getKnownMinValue() - 1;
+    }
+
     // Can be a loop invariant incoming value or the last scalar value to be
     // extracted from the vectorized loop.
-    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
     Value *lastIncomingValue =
-        getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+        getOrCreateScalarValue(IncomingValue, {UF - 1, Lane, Kind});
     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
   }
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -97,6 +97,24 @@
   /// in [0..VF)
   unsigned Lane;
+
+  /// There are cases where we wish to extract a particular lane of a vector,
+  /// but don't know the exact value at compile time. For example, extracting
+  /// the last lane of a scalable vector requires a runtime calculation. The
+  /// enum below describes how 'Lane' is interpreted. The default is LK_First,
+  /// which means 'Lane' is an offset from the start of the vector. To extract
+  /// a lane at an offset from the end, i.e. LastLane - Lane, set the lane
+  /// 'Kind' to LK_ScalableLast.
+  enum LaneKind {
+    LK_First,
+    LK_ScalableLast,
+  };
+
+  /// Indicates whether 'Lane' counts from the beginning or back from the end.
+  LaneKind Kind;
+
+  bool isNonConstLane() const { return Kind != LK_First; }
 };
 
 /// This is a helper struct for maintaining vectorization state. It's used for
@@ -158,6 +176,20 @@
     return ScalarMapStorage.count(Key);
   }
 
+  /// Returns the number of lanes that we are able to cache.
+  unsigned getNumCachedLanes() const { return 2 * VF.getKnownMinValue(); }
+
+  /// Maps the Lane in \p Instance to a cache index.
+  unsigned mapInstanceLaneToIndex(const VPIteration &Instance) const {
+    switch (Instance.Kind) {
+    case VPIteration::LK_ScalableLast:
+      return VF.getKnownMinValue() + Instance.Lane;
+    default:
+      assert(Instance.Kind == VPIteration::LK_First);
+      return Instance.Lane;
+    }
+  }
+
   /// \return True if the map has a scalar entry for \p Key and \p Instance.
   bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
@@ -168,9 +200,10 @@
       return false;
     const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
     assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
-    assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
+    assert(Entry[Instance.Part].size() == getNumCachedLanes() &&
           "ScalarParts has wrong dimensions.");
-    return Entry[Instance.Part][Instance.Lane] != nullptr;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    return Entry[Instance.Part][CacheIdx] != nullptr;
   }
 
   /// Retrieve the existing vector value that corresponds to \p Key and
@@ -184,7 +217,8 @@
   /// \p Instance.
   Value *getScalarValue(Value *Key, const VPIteration &Instance) {
     assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
-    return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    return ScalarMapStorage[Key][Instance.Part][CacheIdx];
   }
 
   /// Set a vector value associated with \p Key and \p Part. Assumes such a
@@ -207,10 +241,11 @@
       // TODO: Consider storing uniform values only per-part, as they occupy
       // lane 0 only, keeping the other VF-1 redundant entries null.
       for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part].resize(VF.getKnownMinValue(), nullptr);
+        Entry[Part].resize(getNumCachedLanes(), nullptr);
       ScalarMapStorage[Key] = Entry;
     }
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 
   /// Reset the vector value associated with \p Key for the given \p Part.
@@ -230,7 +265,8 @@
                         Value *Scalar) {
     assert(hasScalarValue(Key, Instance) &&
            "Scalar value not set for part and lane");
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 };
 
@@ -300,6 +336,7 @@
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
       return false;
+    assert(!Instance.isNonConstLane() && "Non-constant lanes unsupported");
    return Instance.Part < I->second.size() &&
           Instance.Lane < I->second[Instance.Part].size() &&
           I->second[Instance.Part][Instance.Lane];
@@ -321,6 +358,7 @@
     while (PerPartVec.size() <= Instance.Part)
       PerPartVec.emplace_back();
     auto &Scalars = PerPartVec[Instance.Part];
+    assert(!Instance.isNonConstLane() && "Non-constant lanes unsupported");
     while (Scalars.size() <= Instance.Lane)
       Scalars.push_back(nullptr);
     Scalars[Instance.Lane] = V;
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -220,6 +220,9 @@
   if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue()))
     return Def->getLiveInIRValue();
 
+  assert(!Instance.isNonConstLane() &&
+         "Non-constant lanes need handling with callbacks!");
+
   if (hasScalarValue(Def, Instance))
     return Data.PerPartScalars[Def][Instance.Part][Instance.Lane];
 
@@ -287,7 +290,8 @@
 void VPBasicBlock::execute(VPTransformState *State) {
   bool Replica = State->Instance &&
-                 !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+                 !(State->Instance->Part == 0 && State->Instance->Lane == 0 &&
+                   State->Instance->Kind == VPIteration::LK_First);
   VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
   VPBlockBase *SingleHPred = nullptr;
   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -401,7 +405,7 @@
     assert(!State->Instance && "Replicating a Region with non-null instance.");
 
     // Enter replicating mode.
-    State->Instance = {0, 0};
+    State->Instance = {0, 0, VPIteration::LK_First};
 
     for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
       State->Instance->Part = Part;
Index: llvm/test/Transforms/LoopVectorize/AArch64/neon-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/neon-extract-last-veclane.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+neon -S < %s 2>%t | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %mul.lcssa = phi i32 [ %mul, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul.lcssa, i32* %arrayidx5, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !0
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %inv.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %inv.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -0,0 +1,84 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %mul.lcssa = phi i32 [ %mul, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul.lcssa, i32* %arrayidx5, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !0
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %inv.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %inv.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
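
Note: the snippet below is an illustrative, hand-written sketch and not part of the patch or its generated output. It shows, under the assumption of a scalable VF with a known minimum of 4 x i32, the shape of middle-block IR an LK_ScalableLast extract is intended to produce (getRuntimeVF multiplied by vscale, minus one, feeding extractelement); after instcombine the multiply folds to the shl matched by the SVE test above. The function and value names here are hypothetical.

; Sketch: extracting the last lane of a scalable vector at runtime.
declare i32 @llvm.vscale.i32()

define i32 @extract_last_lane_sketch(<vscale x 4 x i32> %vec) {
  %vscale = call i32 @llvm.vscale.i32()
  ; Runtime VF = vscale * known minimum VF (4 here), as computed by getRuntimeVF.
  %runtime.vf = mul i32 %vscale, 4
  ; Last lane index = runtime VF - (1 + Instance.Lane), with Instance.Lane == 0
  ; for a plain last-lane extract.
  %last.lane = sub i32 %runtime.vf, 1
  %val = extractelement <vscale x 4 x i32> %vec, i32 %last.lane
  ret i32 %val
}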