Index: llvm/lib/Transforms/Scalar/LoopFuse.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -703,7 +703,6 @@ std::pair> haveIdenticalTripCounts(const FusionCandidate &FC0, const FusionCandidate &FC1) const { - const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); if (isa(TripCount0)) { UncomputableTripCount++; @@ -1040,6 +1039,93 @@ return Fused; } + bool canHoistInst(Instruction &I, + const SmallVector &SafeToHoist, + const SmallVector &NotHoisting, + const FusionCandidate &FC0) const { + // First check if can be hoisted + // If the operands of this instruction dominate the FC0 Preheader + // target block, then it is safe to move them to the end of the FC0 + const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); + assert(FC0PreheaderTarget && + "Expected single successor for loop preheader."); + + for (Use &Op : I.operands()) { + if (auto *OpInst = dyn_cast(Op)) { + bool OpHoisted = is_contained(SafeToHoist, OpInst); + // Check if we have already decided to hoist this operand. In this + // case, it does not dominate FC0 *yet*, but will after we hoist it. + if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { + return false; + } + } + } + + // If this isn't a memory inst, hoisting is safe + if (!I.mayReadFromMemory() && !I.mayWriteToMemory()) { + return true; + } + + LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); + for (Instruction *NotHoistedInst : NotHoisting) { + if (auto D = DI.depends(&I, NotHoistedInst, true)) { + // Dependency is not read-before-write, write-before-read or + // write-before-write + if (D->isFlow() || D->isAnti() || D->isOutput()) { + LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " + "preheader that is not being hoisted.\n"); + return false; + } + } + } + + for (Instruction &HeaderInst : *FC0.Header) { + if (auto D = DI.depends(&I, &HeaderInst, true)) { + // Dependency is not read-before-write, write-before-read or + // write-before-write + if (D->isFlow() || D->isAnti() || D->isOutput()) { + LLVM_DEBUG(dbgs() + << "Inst depends on an instruction in FC0's header.\n"); + return false; + } + } + } + + return true; + } + + bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { + for (User *U : I.users()) { + if (auto *UI{dyn_cast(U)}) { + // Cannot sink if user in loop + // If FC1 has phi users of this value, we cannot sink it into FC1. + if (FC1.L->contains(UI)) { + // Cannot hoist or sink this instruction. No hoisting/sinking + // should take place, loops should not fuse + return false; + } + } + } + + // If this isn't a memory inst, sinking is safe + if (!I.mayReadFromMemory() && !I.mayWriteToMemory()) { + return true; + } + + for (Instruction &HeaderInst : *FC1.Header) { + if (auto D = DI.depends(&I, &HeaderInst, true)) { + // Dependency is not read-before-write, write-before-read or + // write-before-write + if (D->isFlow() || D->isAnti() || D->isOutput()) { + LLVM_DEBUG(dbgs() + << "Inst depends on an instruction in FC1's header.\n"); + return false; + } + } + } + return true; + } + /// Collect instructions in the \p FC1 Preheader that can be hoisted /// to the \p FC0 Preheader or sunk into the \p FC1 Body bool collectMovablePreheaderInsts( @@ -1047,6 +1133,10 @@ SmallVector &SafeToHoist, SmallVector &SafeToSink) const { BasicBlock *FC1Preheader = FC1.Preheader; + // Save the instructions that are not being hoisted, so we know not to hoist + // mem insts that they dominate + SmallVector NotHoisting; + for (Instruction &I : *FC1Preheader) { // Can't move a branch if (&I == FC1Preheader->getTerminator()) @@ -1055,52 +1145,36 @@ // TODO: The case of mayReadFromMemory we can handle but requires // additional work with a dependence analysis so for now we give // up on memory reads. - if (I.mayHaveSideEffects() || I.mayReadFromMemory()) { - LLVM_DEBUG(dbgs() << "Inst: " << I << " may have side-effects.\n"); + if (I.mayThrow() || !I.willReturn()) { + LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n"); return false; } LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); - // First check if can be hoisted - // If the operands of this instruction dominate the FC0 Preheader - // target block, then it is safe to move them to the end of the FC0 - const BasicBlock *FC0PreheaderTarget = - FC0.Preheader->getSingleSuccessor(); - assert(FC0PreheaderTarget && - "Expected single successor for loop preheader."); - bool CanHoistInst = true; - for (Use &Op : I.operands()) { - if (auto *OpInst = dyn_cast(Op)) { - bool OpHoisted = is_contained(SafeToHoist, OpInst); - // Check if we have already decided to hoist this operand. In this - // case, it does not dominate FC0 *yet*, but will after we hoist it. - if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { - CanHoistInst = false; - break; - } + if (auto SI = dyn_cast(&I)) { + if (!SI->isUnordered()) { + LLVM_DEBUG( + dbgs() + << "\tInstruction is volatile or atomic. Cannot move it.\n"); + return false; } } - if (CanHoistInst) { + + if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { SafeToHoist.push_back(&I); LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); } else { LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); + NotHoisting.push_back(&I); - for (User *U : I.users()) { - if (auto *UI{dyn_cast(U)}) { - // Cannot sink if user in loop - // If FC1 has phi users of this value, we cannot sink it into FC1. - if (FC1.L->contains(UI)) { - // Cannot hoist or sink this instruction. No hoisting/sinking - // should take place, loops should not fuse - LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); - return false; - } - } + if (canSinkInst(I, FC1)) { + SafeToSink.push_back(&I); + LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); + } else { + LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); + return false; } - SafeToSink.push_back(&I); - LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); } } LLVM_DEBUG( @@ -1331,7 +1405,6 @@ const FusionCandidate &FC1, SmallVector &HoistInsts, SmallVector &SinkInsts) const { - // All preheader instructions except the branch must be hoisted or sunk assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && "Attempting to sink and hoist preheader instructions, but not all " Index: llvm/test/Transforms/LoopFusion/no_sink_hoist_load.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/no_sink_hoist_load.ll @@ -0,0 +1,42 @@ +; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; CHECK: Could not hoist/sink all instructions + +@A = common global [100 x i32] zeroinitializer, align 16 +define void @sink_preheader(i32 %N) { +; CHECK:pre1: +; CHECK-NEXT: %ptr = alloca i32, align 4 +; CHECK-NEXT: br label %body1 +pre1: + %ptr = alloca i32, align 4 + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %stay = +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + store i32 3, i32* %ptr + br i1 %cond, label %body1, label %pre2 + +; CHECK:pre2: +; CHECK-NEXT: %stay = load i32, i32* %ptr +pre2: + %stay = load i32, i32* %ptr + br label %body2 + +; CHECK: body2: +; CHECK-NOT: %stay = +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + store i32 3, i32* %ptr + br i1 %cond2, label %body2, label %exit + +; CHECK: exit: +; CHECK-NOT: %stay = +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/no_sink_hoist_store.ll =================================================================== --- llvm/test/Transforms/LoopFusion/no_sink_hoist_store.ll +++ llvm/test/Transforms/LoopFusion/no_sink_hoist_store.ll @@ -1,6 +1,5 @@ ; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: have side-effects. ; CHECK: Could not hoist/sink all instructions @A = common global [100 x i32] zeroinitializer, align 16 @@ -9,15 +8,16 @@ ; CHECK-NEXT: %ptr = alloca i32, align 4 ; CHECK-NEXT: br label %body1 pre1: - %ptr = alloca i32 + %ptr = alloca i32, align 4 br label %body1 ; CHECK:body1: -; CHECK-NOT: %stay = +; CHECK-NOT: store i32 3, i32* %ptr body1: ; preds = %pre1, %body1 %i = phi i32 [%i_next, %body1], [0, %pre1] %i_next = add i32 1, %i %cond = icmp ne i32 %i, %N + %load1 = load i32, i32* %ptr br i1 %cond, label %body1, label %pre2 ; CHECK:pre2: @@ -27,15 +27,16 @@ br label %body2 ; CHECK: body2: -; CHECK-NOT: %stay = +; CHECK-NOT: store i32 3, i32* %ptr body2: ; preds = %pre2, %body2 %i2 = phi i32 [%i_next2, %body2], [0, %pre2] %i_next2 = add i32 1, %i2 %cond2 = icmp ne i32 %i2, %N + %load2 = load i32, i32* %ptr br i1 %cond2, label %body2, label %exit ; CHECK: exit: -; CHECK-NOT: %stay = +; CHECK-NOT: store i32 3, i32* %ptr exit: ret void } Index: llvm/test/Transforms/LoopFusion/no_sink_hoist_unknown_function.ll =================================================================== --- llvm/test/Transforms/LoopFusion/no_sink_hoist_unknown_function.ll +++ llvm/test/Transforms/LoopFusion/no_sink_hoist_unknown_function.ll @@ -1,6 +1,5 @@ ; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: may have side-effects ; CHECK: Could not hoist/sink all instructions declare void @unknown_func() @@ -12,7 +11,6 @@ br label %body1 ; CHECK:body1: -; CHECK-NOT: %stay = body1: ; preds = %pre1, %body1 %i = phi i32 [%i_next, %body1], [0, %pre1] %i_next = add i32 1, %i @@ -26,7 +24,6 @@ br label %body2 ; CHECK: body2: -; CHECK-NOT: %stay = body2: ; preds = %pre2, %body2 %i2 = phi i32 [%i_next2, %body2], [0, %pre2] %i_next2 = add i32 1, %i2 @@ -34,7 +31,6 @@ br i1 %cond2, label %body2, label %exit ; CHECK: exit: -; CHECK-NOT: %stay = exit: ret void } Index: llvm/test/Transforms/LoopFusion/simple.ll =================================================================== --- llvm/test/Transforms/LoopFusion/simple.ll +++ llvm/test/Transforms/LoopFusion/simple.ll @@ -509,7 +509,7 @@ ; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[ADD]], [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ] ; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[J]] ; CHECK-NEXT: store i32 2, i32* [[AJ]], align 4 -; CHECK-NEXT: [[INC_J]] = add nsw i64 [[J]], 1 +; CHECK-NEXT: [[INC_J]] = add nsw i64 [[J]], [[ADD]] ; CHECK-NEXT: [[CMP_J:%.*]] = icmp slt i64 [[INC_J]], 100 ; CHECK-NEXT: br i1 [[CMP_J]], label [[FOR_SECOND]], label [[FOR_SECOND_EXIT:%.*]] ; CHECK: for.second.exit: @@ -534,7 +534,7 @@ %j = phi i64 [ %add, %for.first.exit ], [ %inc.j, %for.second ] %Aj = getelementptr inbounds i32, i32* %A, i64 %j store i32 2, i32* %Aj, align 4 - %inc.j = add nsw i64 %j, 1 + %inc.j = add nsw i64 %j, %add %cmp.j = icmp slt i64 %inc.j, 100 br i1 %cmp.j, label %for.second, label %for.second.exit Index: llvm/test/Transforms/LoopFusion/sink_load.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/sink_load.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; CHECK: Safe to sink. + +@A = common global [100 x i32] zeroinitializer, align 16 +define void @sink_preheader(i32 %N) { +; CHECK-LABEL: @sink_preheader( +; CHECK-NEXT: pre1: +; CHECK-NEXT: [[PTR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BODY1:%.*]] +; CHECK: body1: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[BODY1]] ], [ 0, [[PRE1:%.*]] ] +; CHECK-NEXT: [[I2:%.*]] = phi i32 [ [[I_NEXT2:%.*]], [[BODY1]] ], [ 0, [[PRE1]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 1, [[I]] +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I]], [[N:%.*]] +; CHECK-NEXT: store i32 3, i32* [[PTR]], align 4 +; CHECK-NEXT: [[I_NEXT2]] = add i32 1, [[I2]] +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i32 [[I2]], [[N]] +; CHECK-NEXT: br i1 [[COND2]], label [[BODY1]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[PTR]], align 4 +; CHECK-NEXT: ret void +; +pre1: + %ptr = alloca i32 + br label %body1 + +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + store i32 3, i32* %ptr + br i1 %cond, label %body1, label %pre2 + +pre2: + %b = load i32, i32 * %ptr + br label %body2 + +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/sink_store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/sink_store.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +@A = common global [100 x i32] zeroinitializer, align 16 +define void @sink_preheader(i32 %N) { +; CHECK-LABEL: @sink_preheader( +; CHECK-NEXT: pre1: +; CHECK-NEXT: [[PTR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BODY1:%.*]] +; CHECK: body1: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[BODY1]] ], [ 0, [[PRE1:%.*]] ] +; CHECK-NEXT: [[I2:%.*]] = phi i32 [ [[I_NEXT2:%.*]], [[BODY1]] ], [ 0, [[PRE1]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 1, [[I]] +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I]], [[N:%.*]] +; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[PTR]], align 4 +; CHECK-NEXT: [[I_NEXT2]] = add i32 1, [[I2]] +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i32 [[I2]], [[N]] +; CHECK-NEXT: br i1 [[COND2]], label [[BODY1]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i32 3, i32* [[PTR]], align 4 +; CHECK-NEXT: ret void +; +pre1: + %ptr = alloca i32 + br label %body1 + +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + %b = load i32, i32 * %ptr + br i1 %cond, label %body1, label %pre2 + +pre2: + store i32 3, i32* %ptr + br label %body2 + +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +exit: + ret void +}