Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4052,8 +4052,11 @@
     // getOrCreateVectorValue calls below.
     Builder.SetInsertPoint(NewPhi);
 
-    // The predecessor order is preserved and we can rely on mapping between
-    // scalar and vector block predecessors.
+    // The order of predecessors can be different between OrigPhi BB and NewPhi
+    // BB. Check dominance of incoming values and map incoming value and BB
+    // according to the dominance.
+    SmallVector<Value *, 2> NewIncomingValues;
+    bool NeedSwap = false;
     for (unsigned i = 0; i < NumIncomingValues; ++i) {
       BasicBlock *NewPredBB = VectorBBPredecessors[i];
 
@@ -4064,6 +4067,29 @@
 
       // Scalar incoming value may need a broadcast
       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+
+      // A value is usable on the edge from NewPredBB only if it dominates the
+      // terminator of NewPredBB. Querying the terminator rather than the block
+      // also accepts values that are defined inside NewPredBB itself.
+      if (Instruction *NewIncI = dyn_cast<Instruction>(NewIncV)) {
+        if (!DT->dominates(NewIncI, NewPredBB->getTerminator())) {
+          NeedSwap = true;
+        }
+      }
+      NewIncomingValues.push_back(NewIncV);
+    }
+
+    // A mismatch means the two predecessor lists are ordered differently.
+    // Exchanging the first two values is only meaningful for a two-input PHI.
+    if (NeedSwap) {
+      assert(NumIncomingValues == 2 &&
+             "Swapping incoming values requires exactly two predecessors");
+      std::swap(NewIncomingValues[0], NewIncomingValues[1]);
+    }
+
+    for (unsigned i = 0; i < NumIncomingValues; ++i) {
+      BasicBlock *NewPredBB = VectorBBPredecessors[i];
+      Value *NewIncV = NewIncomingValues[i];
       NewPhi->addIncoming(NewIncV, NewPredBB);
     }
   }
Index: llvm/test/Transforms/LoopVectorize/vplan-wrong-vec-phi.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/vplan-wrong-vec-phi.ll
@@ -0,0 +1,88 @@
+; RUN: opt < %s -jump-threading -loop-vectorize -enable-vplan-native-path -S
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @matrixMulScalar([4 x i32]* nocapture readonly %src1, [4 x i32]* nocapture readonly %src2, [4 x i32]* nocapture %dest, i32 %n) local_unnamed_addr #0 {
+entry:
+  %cmp57 = icmp eq i32 %n, 0
+  br i1 %cmp57, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %wide.trip.count65 = zext i32 %n to i64
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
+  %indvar = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvar.next, %for.cond.cleanup3 ]
+  br i1 true, label %for.body4.lr.ph, label %for.cond1.preheader.for.cond.cleanup3_crit_edge
+
+for.cond1.preheader.for.cond.cleanup3_crit_edge:  ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup3
+
+for.body4.lr.ph:                                  ; preds = %for.cond1.preheader
+  br label %for.body4
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup3
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.cond.cleanup3.loopexit:                       ; preds = %for.cond.cleanup9
+  br label %for.cond.cleanup3
+
+for.cond.cleanup3:                                ; preds = %for.cond1.preheader.for.cond.cleanup3_crit_edge, %for.cond.cleanup3.loopexit
+  %indvar.next = add nuw nsw i64 %indvar, 1
+  %exitcond66 = icmp eq i64 %indvar.next, %wide.trip.count65
+  br i1 %exitcond66, label %for.cond.cleanup.loopexit, label %for.cond1.preheader, !llvm.loop !2
+
+for.body4:                                        ; preds = %for.cond.cleanup9, %for.body4.lr.ph
+  %indvars.iv60 = phi i64 [ 0, %for.body4.lr.ph ], [ %indvars.iv.next61, %for.cond.cleanup9 ]
+  %arrayidx6 = getelementptr inbounds [4 x i32], [4 x i32]* %dest, i64 %indvar, i64 %indvars.iv60
+  store i32 0, i32* %arrayidx6, align 4, !tbaa !4
+  br i1 true, label %for.body10.preheader, label %for.body4.for.cond.cleanup9_crit_edge
+
+for.body4.for.cond.cleanup9_crit_edge:            ; preds = %for.body4
+  br label %for.cond.cleanup9
+
+for.body10.preheader:                             ; preds = %for.body4
+  br label %for.body10
+
+for.cond.cleanup9.loopexit:                       ; preds = %for.body10
+  br label %for.cond.cleanup9
+
+for.cond.cleanup9:                                ; preds = %for.body4.for.cond.cleanup9_crit_edge, %for.cond.cleanup9.loopexit
+  %indvars.iv.next61 = add nuw nsw i64 %indvars.iv60, 1
+  %exitcond63 = icmp eq i64 %indvars.iv.next61, %wide.trip.count65
+  br i1 %exitcond63, label %for.cond.cleanup3.loopexit, label %for.body4
+
+for.body10:                                       ; preds = %for.body10, %for.body10.preheader
+  %0 = phi i32 [ 0, %for.body10.preheader ], [ %add, %for.body10 ]
+  %indvars.iv = phi i64 [ 0, %for.body10.preheader ], [ %indvars.iv.next, %for.body10 ]
+  %arrayidx18 = getelementptr inbounds [4 x i32], [4 x i32]* %src1, i64 %indvar, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx18, align 4, !tbaa !4
+  %arrayidx22 = getelementptr inbounds [4 x i32], [4 x i32]* %src2, i64 %indvars.iv, i64 %indvars.iv60
+  %2 = load i32, i32* %arrayidx22, align 4, !tbaa !4
+  %mul = mul nsw i32 %2, %1
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx6, align 4, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count65
+  br i1 %exitcond, label %for.cond.cleanup9.loopexit, label %for.body10
+}
+
+attributes #0 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git 37309fb02f60557f18971dc575904c0fc56c91ab)"}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+