Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -85,7 +85,8 @@ #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, - Loop *L, DependenceInfo *DI) { + Loop *L, DependenceInfo *DI, + DominatorTree *DT) { using ValueVector = SmallVector; ValueVector MemInstr; @@ -147,7 +148,15 @@ Direction = '>'; Dep.push_back(Direction); } else if (D->isScalar(II)) { - Direction = 'S'; + // If the execution of either Src or Dst depends on IV-related + // condition, interchange may mess up memory write order and result + // in incorrect result. + // FIXME: If there're no loads access the same memory location of + // two ouptut dependent stores, it could be safe to interchange. + if (D->isOutput() && !DT->dominates(Src, Dst)) + Direction = '*'; + else + Direction = 'S'; Dep.push_back(Direction); } else { unsigned Dir = D->getDirection(II); @@ -511,7 +520,7 @@ CharMatrix DependencyMatrix; Loop *OuterMostLoop = *(LoopList.begin()); if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, - OuterMostLoop, DI)) { + OuterMostLoop, DI, DT)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; } Index: llvm/test/Transforms/LoopInterchange/pr47523-implicit-out-dep-order.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopInterchange/pr47523-implicit-out-dep-order.ll @@ -0,0 +1,105 @@ +; RUN: opt < %s -loop-interchange -S | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +%struct.a = type { i8 } + +@f = dso_local local_unnamed_addr global [4 x [9 x i32]] [[9 x i32] [i32 5, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0], [9 x i32] zeroinitializer, [9 x i32] zeroinitializer, [9 x i32] zeroinitializer], align 4 +@g = common dso_local local_unnamed_addr global i32 0, align 4 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@c = common dso_local global [1 x %struct.a] zeroinitializer, align 1 + +define dso_local i32 @main() local_unnamed_addr { +; CHECK-LABEL: @main( +; CHECK-NEXT: for.cond2.preheader.preheader.i: +; CHECK-NEXT: [[I:%.*]] = sext i32 undef to i64 +; CHECK-NEXT: br label [[FOR_COND2_PREHEADER_I:%.*]] +; CHECK: for.cond2.preheader.i: +; CHECK-NEXT: [[INDVARS_IV20_I:%.*]] = phi i64 [ [[I]], [[FOR_COND2_PREHEADER_PREHEADER_I:%.*]] ], [ [[INDVARS_IV_NEXT21_I:%.*]], [[FOR_END11_I:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY4_I:%.*]] +; CHECK: for.body4.i: +; CHECK-NEXT: [[INDVARS_IV_I:%.*]] = phi i64 [ 0, [[FOR_COND2_PREHEADER_I]] ], [ [[INDVARS_IV_NEXT_I:%.*]], [[IF_END_I:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x [9 x i32]], [4 x [9 x i32]]* @f, i64 0, i64 [[INDVARS_IV_I]], i64 [[INDVARS_IV20_I]] +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[ARRAYIDX6_I]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[I1]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_I]], label [[LAND_END_I:%.*]], label [[LAND_RHS_I:%.*]] +; CHECK: land.rhs.i: +; CHECK-NEXT: store i16 3, i16* bitcast (i32* @g to i16*), align 4, [[TBAA4:!tbaa !.*]] +; CHECK-NEXT: br label [[LAND_END_I]] +; CHECK: land.end.i: +; CHECK-NEXT: br i1 icmp eq (i64 urem (i64 zext (i16 ptrtoint ([1 x %struct.a]* @c to i16) to i64), i64 4073709551606), i64 0), label [[IF_END_I]], label [[IF_THEN_I:%.*]] +; CHECK: if.then.i: +; CHECK-NEXT: [[I2:%.*]] = load i16, i16* bitcast (i32* @g to i16*), align 4, [[TBAA4]] +; CHECK-NEXT: [[INC_I:%.*]] = add i16 [[I2]], 1 +; CHECK-NEXT: store i16 [[INC_I]], i16* bitcast (i32* @g to i16*), align 4, [[TBAA4]] +; CHECK-NEXT: br label [[IF_END_I]] +; CHECK: if.end.i: +; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add nuw nsw i64 [[INDVARS_IV_I]], 1 +; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_I]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[FOR_END11_I]], label [[FOR_BODY4_I]] +; CHECK: for.end11.i: +; CHECK-NEXT: [[INDVARS_IV_NEXT21_I]] = add nsw i64 [[INDVARS_IV20_I]], 1 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i64 [[INDVARS_IV20_I]], 2 +; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_COND2_PREHEADER_I]], label [[FOR_COND_FOR_END14_CRIT_EDGE_I:%.*]] +; CHECK: for.cond.for.end14_crit_edge.i: +; CHECK-NEXT: br label [[H_EXIT:%.*]] +; CHECK: h.exit: +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* @g, align 4, [[TBAA0]] +; CHECK-NEXT: [[CALL4:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 [[I3]]) +; CHECK-NEXT: ret i32 0 +; +for.cond2.preheader.preheader.i: + %i = sext i32 undef to i64 + br label %for.cond2.preheader.i + +for.cond2.preheader.i: ; preds = %for.end11.i, %for.cond2.preheader.preheader.i + %indvars.iv20.i = phi i64 [ %i, %for.cond2.preheader.preheader.i ], [ %indvars.iv.next21.i, %for.end11.i ] + br label %for.body4.i + +for.body4.i: ; preds = %if.end.i, %for.cond2.preheader.i + %indvars.iv.i = phi i64 [ 0, %for.cond2.preheader.i ], [ %indvars.iv.next.i, %if.end.i ] + %arrayidx6.i = getelementptr inbounds [4 x [9 x i32]], [4 x [9 x i32]]* @f, i64 0, i64 %indvars.iv.i, i64 %indvars.iv20.i + %i1 = load i32, i32* %arrayidx6.i, align 4, !tbaa !0 + %tobool.i = icmp eq i32 %i1, 0 + br i1 %tobool.i, label %land.end.i, label %land.rhs.i + +land.rhs.i: ; preds = %for.body4.i + store i16 3, i16* bitcast (i32* @g to i16*), align 4, !tbaa !4 + br label %land.end.i + +land.end.i: ; preds = %land.rhs.i, %for.body4.i + br i1 icmp eq (i64 urem (i64 zext (i16 ptrtoint ([1 x %struct.a]* @c to i16) to i64), i64 4073709551606), i64 0), label %if.end.i, label %if.then.i + +if.then.i: ; preds = %land.end.i + %i2 = load i16, i16* bitcast (i32* @g to i16*), align 4, !tbaa !4 + %inc.i = add i16 %i2, 1 + store i16 %inc.i, i16* bitcast (i32* @g to i16*), align 4, !tbaa !4 + br label %if.end.i + +if.end.i: ; preds = %if.then.i, %land.end.i + %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 + %exitcond.i = icmp eq i64 %indvars.iv.next.i, 3 + br i1 %exitcond.i, label %for.end11.i, label %for.body4.i + +for.end11.i: ; preds = %if.end.i + %indvars.iv.next21.i = add nsw i64 %indvars.iv20.i, 1 + %cmp.i = icmp slt i64 %indvars.iv20.i, 2 + br i1 %cmp.i, label %for.cond2.preheader.i, label %for.cond.for.end14_crit_edge.i + +for.cond.for.end14_crit_edge.i: ; preds = %for.end11.i + br label %h.exit + +h.exit: ; preds = %for.cond.for.end14_crit_edge.i + %i3 = load i32, i32* @g, align 4, !tbaa !0 + %call4 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %i3) + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) local_unnamed_addr + +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"short", !2, i64 0}