Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -85,7 +85,8 @@ #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, - Loop *L, DependenceInfo *DI) { + const Loop *L, DependenceInfo *DI, + const DominatorTree *DT) { using ValueVector = SmallVector; ValueVector MemInstr; @@ -147,7 +148,15 @@ Direction = '>'; Dep.push_back(Direction); } else if (D->isScalar(II)) { - Direction = 'S'; + // If the execution of either Src or Dst depends on IV-related + // condition, interchange may mess up memory write order and result + // in incorrect result. + // FIXME: If there're no loads access the same memory location of + // two ouptut dependent stores, it could be safe to interchange. + if (D->isOutput() && !DT->dominates(Src, Dst)) + Direction = '*'; + else + Direction = 'S'; Dep.push_back(Direction); } else { unsigned Dir = D->getDirection(II); @@ -511,7 +520,7 @@ CharMatrix DependencyMatrix; Loop *OuterMostLoop = *(LoopList.begin()); if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, - OuterMostLoop, DI)) { + OuterMostLoop, DI, DT)) { LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; } Index: llvm/test/Transforms/LoopInterchange/pr47523-implicit-out-dep-order.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopInterchange/pr47523-implicit-out-dep-order.ll @@ -0,0 +1,170 @@ +; RUN: opt < %s -loop-interchange -S | FileCheck %s + +@f = dso_local local_unnamed_addr global [4 x [9 x i32]] [[9 x i32] [i32 5, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0], [9 x i32] zeroinitializer, [9 x i32] zeroinitializer, [9 x i32] zeroinitializer], align 4 +@g = common dso_local local_unnamed_addr global i32 0, align 4 + +; If there's output dependency inside loop and Src doesn't dominate Dst, do not interchange +define dso_local i32 @test1(i1 %cond) local_unnamed_addr { +; CHECK-LABEL: @test1( +; CHECK-NEXT: for.preheader: +; CHECK-NEXT: br label [[FOR_HEADER:%.*]] +; CHECK: for.header: +; CHECK-NEXT: [[INDVARS_IV20_I:%.*]] = phi i64 [ 0, [[FOR_PREHEADER:%.*]] ], [ [[INDVARS_IV_NEXT21_I:%.*]], [[FOR_LATCH:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV_I:%.*]] = phi i64 [ 0, [[FOR_HEADER]] ], [ [[INDVARS_IV_NEXT_I:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x [9 x i32]], [4 x [9 x i32]]* @f, i64 0, i64 [[INDVARS_IV_I]], i64 [[INDVARS_IV20_I]] +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[ARRAYIDX6_I]], align 4 +; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[I1]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_I]], label [[LAND_END:%.*]], label [[LAND_RHS:%.*]] +; CHECK: land.rhs: +; CHECK-NEXT: store i32 3, i32* @g, align 4 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_END]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* @g, align 4 +; CHECK-NEXT: [[INC_I:%.*]] = add i32 [[I2]], 1 +; CHECK-NEXT: store i32 [[INC_I]], i32* @g, align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[INDVARS_IV_NEXT_I]] = add nuw nsw i64 [[INDVARS_IV_I]], 1 +; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_I]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[FOR_LATCH]], label [[FOR_BODY]] +; CHECK: for.latch: +; CHECK-NEXT: [[INDVARS_IV_NEXT21_I]] = add nsw i64 [[INDVARS_IV20_I]], 1 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i64 [[INDVARS_IV20_I]], 2 +; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_HEADER]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* @g, align 4 +; CHECK-NEXT: ret i32 [[I3]] +; +for.preheader: + br label %for.header + +for.header: ; preds = %for.latch, %for.preheader + %indvars.iv20.i = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next21.i, %for.latch ] + br label %for.body + +for.body: ; preds = %if.end, %for.header + %indvars.iv.i = phi i64 [ 0, %for.header ], [ %indvars.iv.next.i, %if.end ] + %arrayidx6.i = getelementptr inbounds [4 x [9 x i32]], [4 x [9 x i32]]* @f, i64 0, i64 %indvars.iv.i, i64 %indvars.iv20.i + %i1 = load i32, i32* %arrayidx6.i, align 4 + %tobool.i = icmp eq i32 %i1, 0 + br i1 %tobool.i, label %land.end, label %land.rhs + +land.rhs: ; preds = %for.body + store i32 3, i32* @g, align 4 + br label %land.end + +land.end: ; preds = %land.rhs, %for.body4 + br i1 %cond, label %if.end, label %if.then + +if.then: ; preds = %land.end + %i2 = load i32, i32* @g, align 4 + %inc.i = add i32 %i2, 1 + store i32 %inc.i, i32* @g, align 4 + br label %if.end + +if.end: ; preds = %if.then, %land.end + %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 + %exitcond.i = icmp eq i64 %indvars.iv.next.i, 3 + br i1 %exitcond.i, label %for.latch, label %for.body + +for.latch: ; preds = %if.end + %indvars.iv.next21.i = add nsw i64 %indvars.iv20.i, 1 + %cmp.i = icmp slt i64 %indvars.iv20.i, 2 + br i1 %cmp.i, label %for.header, label %exit + +exit: ; preds = %for.latch + %i3 = load i32, i32* @g, align 4 + ret i32 %i3 +} + +; If there's output dependency inside loop and Src dominates Dst, do interchange +define dso_local i32 @test2(i1 %cond) local_unnamed_addr { +; CHECK-LABEL: @test2( +; CHECK-NEXT: for.preheader: +; CHECK-NEXT: br label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.header.preheader: +; CHECK-NEXT: br label [[FOR_HEADER:%.*]] +; CHECK: for.header: +; CHECK-NEXT: [[INDVARS_IV20_I:%.*]] = phi i64 [ [[INDVARS_IV_NEXT21_I:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_HEADER_PREHEADER:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY_SPLIT:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV_I:%.*]] = phi i64 [ [[TMP0:%.*]], [[IF_END_SPLIT:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_HEADER_PREHEADER]] +; CHECK: for.body.split: +; CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x [9 x i32]], [4 x [9 x i32]]* @f, i64 0, i64 [[INDVARS_IV_I]], i64 [[INDVARS_IV20_I]] +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[ARRAYIDX6_I]], align 4 +; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[I1]], 0 +; CHECK-NEXT: store i32 3, i32* @g, align 4 +; CHECK-NEXT: br i1 [[TOBOOL_I]], label [[LAND_END:%.*]], label [[LAND_RHS:%.*]] +; CHECK: land.rhs: +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* @g, align 4 +; CHECK-NEXT: [[INC_I:%.*]] = add i32 [[I2]], 1 +; CHECK-NEXT: store i32 [[INC_I]], i32* @g, align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[INDVARS_IV_NEXT_I:%.*]] = add nuw nsw i64 [[INDVARS_IV_I]], 1 +; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_I]], 3 +; CHECK-NEXT: br label [[FOR_LATCH]] +; CHECK: if.end.split: +; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[INDVARS_IV_I]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 3 +; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.latch: +; CHECK-NEXT: [[INDVARS_IV_NEXT21_I]] = add nsw i64 [[INDVARS_IV20_I]], 1 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i64 [[INDVARS_IV20_I]], 2 +; CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_HEADER]], label [[IF_END_SPLIT]] +; CHECK: exit: +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* @g, align 4 +; CHECK-NEXT: ret i32 [[I3]] +; +for.preheader: + br label %for.header + +for.header: ; preds = %for.latch, %for.preheader + %indvars.iv20.i = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next21.i, %for.latch ] + br label %for.body + +for.body: ; preds = %if.end, %for.header + %indvars.iv.i = phi i64 [ 0, %for.header ], [ %indvars.iv.next.i, %if.end ] + %arrayidx6.i = getelementptr inbounds [4 x [9 x i32]], [4 x [9 x i32]]* @f, i64 0, i64 %indvars.iv.i, i64 %indvars.iv20.i + %i1 = load i32, i32* %arrayidx6.i, align 4 + %tobool.i = icmp eq i32 %i1, 0 + store i32 3, i32* @g, align 4 + br i1 %tobool.i, label %land.end, label %land.rhs + +land.rhs: ; preds = %for.body + br label %land.end + +land.end: ; preds = %land.rhs, %for.body4 + br i1 %cond, label %if.end, label %if.then + +if.then: ; preds = %land.end + %i2 = load i32, i32* @g, align 4 + %inc.i = add i32 %i2, 1 + store i32 %inc.i, i32* @g, align 4 + br label %if.end + +if.end: ; preds = %if.then, %land.end + %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 + %exitcond.i = icmp eq i64 %indvars.iv.next.i, 3 + br i1 %exitcond.i, label %for.latch, label %for.body + +for.latch: ; preds = %if.end + %indvars.iv.next21.i = add nsw i64 %indvars.iv20.i, 1 + %cmp.i = icmp slt i64 %indvars.iv20.i, 2 + br i1 %cmp.i, label %for.header, label %exit + +exit: ; preds = %for.latch + %i3 = load i32, i32* @g, align 4 + ret i32 %i3 +}