diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3617,10 +3617,44 @@ // the incoming edges. VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); + + // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores + // sank outside of the loop would keep the same order as they had in the + // original loop. + SmallVector ReductionPHIList; for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast(&R)) - fixReduction(ReductionPhi, State); - else if (auto *FOR = dyn_cast(&R)) + ReductionPHIList.emplace_back(ReductionPhi); + } + stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, + const VPReductionPHIRecipe *R2) { + auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; + auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; + + // If neither of the recipes has an intermediate store, keep the order the + // same. + if (!IS1 && !IS2) + return false; + + // If only one of the recipes has an intermediate store, then move it + // towards the beginning of the list. + if (IS1 && !IS2) + return true; + + if (!IS1 && IS2) + return false; + + // If both recipes have an intermediate store, then the recipe with the + // later store should be processed earlier. So it should go to the beginning + // of the list. + return DT->dominates(IS2, IS1); + }); + + for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) + fixReduction(ReductionPhi, State); + + for (VPRecipeBase &R : Header->phis()) { + if (auto *FOR = dyn_cast(&R)) fixFixedOrderRecurrence(FOR, State); } } diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -558,14 +558,13 @@ } ; Make sure that if there are several reductions in the loop, the order of invariant stores sank outside of the loop is preserved -; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch ; See https://github.com/llvm/llvm-project/issues/64047 define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4 ; entry: @@ -619,14 +618,13 @@ } ; Same as above but storing is done to two different pointers and they can be aliased -; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) -; CHECK-NEXT: store i32 [[TMP2]], ptr %dst2, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) -; CHECK-NEXT: store i32 [[TMP4]], ptr %dst1, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4 ; entry: br label %for.body