Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8830,6 +8830,14 @@
         auto OpRange = Plan->mapToVPValues(Instr->operands());
         Operands = {OpRange.begin(), OpRange.end()};
       }
+
+      // Invariant stores inside loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block
+      StoreInst *SI;
+      if ((SI = dyn_cast<StoreInst>(&I)) &&
+          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+        continue;
+
       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
               Instr, Operands, Range, Plan)) {
         // If Instr can be simplified to an existing VPValue, use it.
@@ -8865,13 +8873,6 @@
         continue;
       }
 
-      // Invariant stores inside loop will be deleted and a single store
-      // with the final reduction value will be added to the exit block
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
-        continue;
-
       // Otherwise, if all widening options failed, Instruction is to be
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -loop-vectorize -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize \
+; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 1)
+define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) {
+; CHECK-LABEL: @invariant_store_red_exit_is_phi(
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ]
+; CHECK: %[[ACTIVE_LANE_MASK:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n)
+; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32
+; CHECK: %[[ICMP:.*]] = icmp eq <vscale x 4 x i32> %[[LOAD]], zeroinitializer
+; CHECK: %[[PRED1:.*]] = xor <vscale x 4 x i1> %[[ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[PRED2:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i1> %[[PRED1]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: %[[PREDPHI]] = select <vscale x 4 x i1> %[[PRED2]], <vscale x 4 x i32> {{%.*}}, <vscale x 4 x i32> {{%.*}}
+; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[PREDPHI]], <vscale x 4 x i32> %[[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %red = phi i32 [ 0, %entry ], [ %storemerge, %for.inc ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx6, align 4
+  %tobool7.not = icmp eq i32 %load, 0
+  br i1 %tobool7.not, label %if.else, label %if.then8
+
+if.then8:                                         ; preds = %for.body
+  %add = add nsw i32 %red, 91
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %add19 = add nsw i32 %red, 3
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then8, %if.else
+  %storemerge = phi i32 [ %add19, %if.else ], [ %add, %if.then8 ]
+  store i32 %storemerge, i32* %dst, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %if.then4
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -328,8 +328,6 @@
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
 ; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
 ; CHECK: middle.block:
 ; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])