Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,12 @@
   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
 }
 
+/// Return the runtime value for VF.
+static Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2466,10 +2472,21 @@
     return U;
   }
 
+  Value *Lane;
+  if (Instance.isNonConstLane()) {
+    assert(Instance.Lane == VPIteration::LAST_LANE &&
+           "Cannot handle other non-constant lane types");
+    ElementCount EC = cast<VectorType>(U->getType())->getElementCount();
+    assert(EC.isScalable() && "Only expect scalable VF for non-constant lanes");
+    Lane = Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                             Builder.getInt32(1));
+  } else
+    Lane = Builder.getInt32(Instance.Lane);
+
   // Otherwise, the value from the original loop has been vectorized and is
   // represented by UF vector values. Extract and return the requested scalar
   // value from the appropriate vector lane.
-  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+  return Builder.CreateExtractElement(U, Lane);
 }
 
 void InnerLoopVectorizer::packScalarIntoVectorValue(
@@ -2480,6 +2497,7 @@
   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+  assert(!Instance.isNonConstLane() && "Only constant lane indices supported");
   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                             Builder.getInt32(Instance.Lane));
   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
@@ -4384,19 +4402,21 @@
     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
     // Non-instruction incoming values will have only one value.
-    unsigned LastLane = 0;
-    if (isa<Instruction>(IncomingValue))
-      LastLane = Cost->isUniformAfterVectorization(
-                     cast<Instruction>(IncomingValue), VF)
-                     ? 0
-                     : VF.getKnownMinValue() - 1;
-    assert((!VF.isScalable() || LastLane == 0) &&
-           "scalable vectors dont support non-uniform scalars yet");
+
+    unsigned Lane;
+    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+    if (isa<Instruction>(IncomingValue) &&
+        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+                                           VF))
+      Lane =
+          VF.isScalable() ? VPIteration::LAST_LANE : VF.getKnownMinValue() - 1;
+    else
+      Lane = 0;
+
     // Can be a loop invariant incoming value or the last scalar value to be
     // extracted from the vectorized loop.
-    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
     Value *lastIncomingValue =
-        getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+        getOrCreateScalarValue(IncomingValue, {UF - 1, Lane});
     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
   }
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -92,11 +92,29 @@
 /// VPIteration represents a single point in the iteration space of the output
 /// (vectorized and/or unrolled) IR loop.
 struct VPIteration {
+  /// There are cases where we wish to extract a particular lane of a vector,
+  /// but don't know the exact value at compile time.
+  /// For example, we may wish to extract the last lane of a scalable vector,
+  /// where the lane index is only known at runtime. The enum below lists
+  /// special lane values that represent such cases and provides a mechanism
+  /// that still permits these values to be cached.
+  enum NonConstantLanes {
+    LAST_LANE = 0xFFFFFFFF,
+    FIRST_NON_CONST_LANE = LAST_LANE,
+    LAST_NON_CONST_LANE = LAST_LANE
+  };
+
   /// in [0..UF)
   unsigned Part;
 
   /// in [0..VF)
   unsigned Lane;
+
+  bool isNonConstLane() const { return Lane >= FIRST_NON_CONST_LANE; }
+
+  static unsigned getNumNonConstLanes() {
+    return LAST_NON_CONST_LANE - FIRST_NON_CONST_LANE + 1;
+  }
 };
 
 /// This is a helper struct for maintaining vectorization state. It's used for
@@ -158,19 +176,34 @@
     return ScalarMapStorage.count(Key);
   }
 
+  /// Returns the number of lanes that we are able to cache.
+  unsigned getNumCachedLanes() const {
+    return VF.getKnownMinValue() + VPIteration::getNumNonConstLanes();
+  }
+
+  /// Maps the Lane in \p Instance to a cache index.
+  unsigned mapInstanceLaneToIndex(const VPIteration &Instance) const {
+    if (Instance.isNonConstLane())
+      return VF.getKnownMinValue() +
+             (Instance.Lane - VPIteration::FIRST_NON_CONST_LANE);
+    return Instance.Lane;
+  }
+
   /// \return True if the map has a scalar entry for \p Key and \p Instance.
   bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
-    assert(Instance.Lane < VF.getKnownMinValue() &&
-           "Queried Scalar Lane is too large.");
+    assert(
+        (Instance.Lane < VF.getKnownMinValue() || Instance.isNonConstLane()) &&
+        "Queried Scalar Lane is too large.");
 
     if (!hasAnyScalarValue(Key))
       return false;
     const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
     assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
-    assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
+    assert(Entry[Instance.Part].size() == getNumCachedLanes() &&
            "ScalarParts has wrong dimensions.");
-    return Entry[Instance.Part][Instance.Lane] != nullptr;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    return Entry[Instance.Part][CacheIdx] != nullptr;
   }
 
   /// Retrieve the existing vector value that corresponds to \p Key and
@@ -184,7 +217,8 @@
   /// \p Instance.
   Value *getScalarValue(Value *Key, const VPIteration &Instance) {
     assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
-    return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    return ScalarMapStorage[Key][Instance.Part][CacheIdx];
   }
 
   /// Set a vector value associated with \p Key and \p Part. Assumes such a
@@ -207,10 +241,11 @@
       // TODO: Consider storing uniform values only per-part, as they occupy
       // lane 0 only, keeping the other VF-1 redundant entries null.
       for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part].resize(VF.getKnownMinValue(), nullptr);
+        Entry[Part].resize(getNumCachedLanes(), nullptr);
       ScalarMapStorage[Key] = Entry;
     }
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 
   /// Reset the vector value associated with \p Key for the given \p Part.
@@ -230,7 +265,8 @@
                         Value *Scalar) {
     assert(hasScalarValue(Key, Instance) &&
            "Scalar value not set for part and lane");
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 };
 
@@ -298,6 +334,8 @@
       return VecPart;
     }
     // TODO: Cache created scalar values.
+    assert(!Instance.isNonConstLane() &&
+           "Non-constant lanes need handling with callbacks!");
     return Builder.CreateExtractElement(VecPart,
                                         Builder.getInt32(Instance.Lane));
   }
Index: llvm/test/Transforms/LoopVectorize/AArch64/neon-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/neon-extract-last-veclane.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+neon -S < %s 2>%t | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %mul.lcssa = phi i32 [ %mul, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul.lcssa, i32* %arrayidx5, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !0
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %inv.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %inv.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon" }
"target-features"="+neon" } + +!0 = distinct !{!0, !1, !2, !3, !4, !5} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false} +!4 = !{!"llvm.loop.interleave.count", i32 1} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = distinct !{!6, !1, !2, !3, !4, !5} Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll @@ -0,0 +1,84 @@ +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s + +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 { +; CHECK-LABEL: @inv_store_last_lane +; CHECK: vector.body: +; CHECK: store %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] + +entry: + %cmp12 = icmp sgt i64 %n, 0 + br i1 %cmp12, label %for.body, label %for.cond.cleanup + +for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body + %mul.lcssa = phi i32 [ %mul, %for.body ] + %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42 + store i32 %mul.lcssa, i32* %arrayidx5, align 4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !0 +} + +define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 { +; CHECK-LABEL: @ret_last_lane +; CHECK: vector.body: +; CHECK: store %[[VEC_VAL:.*]], < +; CHECK: middle.block: +; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] + +entry: + %cmp12 = icmp sgt i64 %n, 0 + br i1 %cmp12, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %inv.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %mul, %for.body ] + ret float %inv.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %mul = fmul float %0, 2.000000e+00 + %arrayidx2 = getelementptr inbounds float, 
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
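
For illustration only, the middle-block sequence checked by the SVE test corresponds to the following standalone IR sketch; the function and value names are invented for this sketch and it assumes a known minimum VF of 4. getRuntimeVF() emits vscale * 4 (which -instcombine folds to the shl checked above), and VPIteration::LAST_LANE resolves to that runtime VF minus one:

define i32 @extract_last_lane_sketch(<vscale x 4 x i32> %vec.val) {
  ; Runtime vectorization factor: vscale * known minimum VF (4 here).
  %vscale = call i32 @llvm.vscale.i32()
  %runtime.vf = mul i32 %vscale, 4
  ; LAST_LANE resolves to the runtime VF minus one.
  %last.lane = sub i32 %runtime.vf, 1
  ; Extract the scalar value produced by the final vector lane.
  %scalar = extractelement <vscale x 4 x i32> %vec.val, i32 %last.lane
  ret i32 %scalar
}

declare i32 @llvm.vscale.i32()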