diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3756,10 +3756,44 @@ // the incoming edges. VPBasicBlock *Header = State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); + + // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores + // sank outside of the loop would keep the same order as they had in the + // original loop. + SmallVector ReductionPHIList; for (VPRecipeBase &R : Header->phis()) { if (auto *ReductionPhi = dyn_cast(&R)) - fixReduction(ReductionPhi, State); - else if (auto *FOR = dyn_cast(&R)) + ReductionPHIList.emplace_back(ReductionPhi); + } + stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, + const VPReductionPHIRecipe *R2) { + auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; + auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; + + // If neither of the recipes has an intermediate store, keep the order the + // same. + if (!IS1 && !IS2) + return false; + + // If only one of the recipes has an intermediate store, then move it + // towards the beginning of the list. + if (IS1 && !IS2) + return true; + + if (!IS1 && IS2) + return false; + + // If both recipes have an intermediate store, then the recipe with the + // later store should be processed earlier. So it should go to the beginning + // of the list. + return DT->dominates(IS2, IS1); + }); + + for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) + fixReduction(ReductionPhi, State); + + for (VPRecipeBase &R : Header->phis()) { + if (auto *FOR = dyn_cast(&R)) fixFixedOrderRecurrence(FOR, State); } } diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -560,14 +560,13 @@ ret i32 %add.lcssa } -; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch ; Make sure that if there are several reductions in the loop, the order of invariant stores sank outside of the loop is preserved define void @reduc_add_mul_store(ptr %dst, ptr readonly %src) { ; CHECK-LABEL: define void @reduc_add_mul_store ; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]]) ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]]) ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4 ; entry: