Index: llvm/include/llvm/Transforms/Scalar/GVN.h =================================================================== --- llvm/include/llvm/Transforms/Scalar/GVN.h +++ llvm/include/llvm/Transforms/Scalar/GVN.h @@ -328,6 +328,9 @@ bool PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); + bool PerformLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + /// Eliminates partially redundant \p Load, replacing it with \p /// AvailableLoads (connected by Phis if needed). void eliminatePartiallyRedundantLoad( Index: llvm/lib/Transforms/Scalar/GVN.cpp =================================================================== --- llvm/lib/Transforms/Scalar/GVN.cpp +++ llvm/lib/Transforms/Scalar/GVN.cpp @@ -91,13 +91,14 @@ #define DEBUG_TYPE "gvn" -STATISTIC(NumGVNInstr, "Number of instructions deleted"); -STATISTIC(NumGVNLoad, "Number of loads deleted"); -STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); +STATISTIC(NumGVNInstr, "Number of instructions deleted"); +STATISTIC(NumGVNLoad, "Number of loads deleted"); +STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); STATISTIC(NumGVNBlocks, "Number of blocks merged"); -STATISTIC(NumGVNSimpl, "Number of instructions simplified"); +STATISTIC(NumGVNSimpl, "Number of instructions simplified"); STATISTIC(NumGVNEqProp, "Number of equalities propagated"); -STATISTIC(NumPRELoad, "Number of loads PRE'd"); +STATISTIC(NumPRELoad, "Number of loads PRE'd"); +STATISTIC(NumPRELoopLoad, "Number of loop loads PRE'd"); STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax, "Number of blocks speculated as available in " @@ -1447,6 +1448,93 @@ return true; } +bool GVN::PerformLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { + if (!LI) + return false; + + const Loop *L = LI->getLoopFor(Load->getParent()); + // TODO: Generalize to other loop blocks that dominate the latch. + if (!L || L->getHeader() != Load->getParent()) + return false; + + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Latch = L->getLoopLatch(); + if (!Preheader || !Latch) + return false; + + // We can side-exit before the load is executed. + if (ICF->isDominatedByICFIFromSameBlock(Load)) + return false; + + BasicBlock *LoopBlock = nullptr; + for (auto *Blocker : UnavailableBlocks) { + // Blockers from outside the loop are handled in preheader. + if (!L->contains(Blocker)) + continue; + + // Only allow one loop block. Loop header is not less frequently executed + // than each loop block, and likely it is much more frequently executed. But + // in case of multiple loop blocks, we need extra information (such as block + // frequency info) to understand whether it is profitable to PRE into + // multiple loop blocks. + if (LoopBlock) + return false; + + // Do not sink into inner loops. This may be non-profitable. + if (L != LI->getLoopFor(Blocker)) + return false; + + // Blocks that dominate the latch execute on every single iteration, maybe + // except the last one. So PREing into these blocks doesn't make much sense + // in most cases. But the blocks that do not necessarily execute on each + // iteration are sometimes much colder than the header, and this is when + // PRE is potentially profitable. + if (DT->dominates(Blocker, Latch)) + return false; + + // Make sure that the terminator itself doesn't clobber. + if (Blocker->getTerminator()->mayWriteToMemory()) + return false; + + LoopBlock = Blocker; + } + + if (!LoopBlock) + return false; + + // We know that some instructions in LoopBlock do clobber this pointer. But it + // is only safe to reload after it if this memory is still dereferencable. To + // prove that, check either: + // - The load is safe to speculate anywhere; + // - Load is from current stack frame; + // - The blocker doesn't contain functions that may free memory. + Value *LoadPtr = Load->getPointerOperand(); + + bool ProvedNoDealloc = + isa(LoadPtr) || isSafeToSpeculativelyExecute(Load) || + all_of(*LoopBlock, [](Instruction &I) { + if (auto *CI = dyn_cast(&I)) + return !CI->mayWriteToMemory() || + CI->getAttributes().hasAttribute(AttributeList::FunctionIndex, + Attribute::NoFree); + return true; + }); + // TODO: Maybe more ways to analyze this. + if (!ProvedNoDealloc) + return false; + + // TODO: Support critical edge splitting if blocker has more than 1 successor. + MapVector AvailableLoads; + AvailableLoads[LoopBlock] = LoadPtr; + AvailableLoads[Preheader] = LoadPtr; + + LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOOP LOAD: " << *Load << '\n'); + eliminatePartiallyRedundantLoad(Load, ValuesPerBlock, AvailableLoads); + ++NumPRELoopLoad; + return true; +} + static void reportLoadElim(LoadInst *Load, Value *AvailableValue, OptimizationRemarkEmitter *ORE) { using namespace ore; @@ -1544,7 +1632,8 @@ if (!isLoadInLoopPREEnabled() && LI && LI->getLoopFor(Load->getParent())) return Changed; - return Changed || PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks); + return Changed || PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks) || + PerformLoopLoadPRE(Load, ValuesPerBlock, UnavailableBlocks); } static bool impliesEquivalanceIfTrue(CmpInst* Cmp) { Index: llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll +++ llvm/test/Transforms/GVN/PRE/pre-aliasning-path.ll @@ -7,22 +7,24 @@ declare void @no_side_effect() readonly -; TODO: We can PRE the load into the cold path, removing it from the hot path. define i32 @test_01(i32* %p) { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X]], 100 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]] ; CHECK: hot_path: ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: cold_path: ; CHECK-NEXT: call void @side_effect_0() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -54,22 +56,24 @@ ret i32 %x } -; TODO: We can PRE the load into the cold path, removing it from the hot path. define i32 @test_02(i32* %p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X]], 100 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]] ; CHECK: hot_path: ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: cold_path: ; CHECK-NEXT: call void @side_effect_1(i32 [[X]]) #[[ATTR0]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] Index: llvm/test/Transforms/GVN/PRE/pre-loop-load.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/pre-loop-load.ll +++ llvm/test/Transforms/GVN/PRE/pre-loop-load.ll @@ -7,22 +7,24 @@ declare i32 @personality_function() -; TODO: We can PRE the load away from the hot path. define i32 @test_load_on_cold_path(i32* %p) { ; CHECK-LABEL: @test_load_on_cold_path( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]] ; CHECK: hot_path: ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: cold_path: ; CHECK-NEXT: call void @side_effect() #[[ATTR0:[0-9]+]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -253,18 +255,21 @@ define i32 @test_load_on_exiting_cold_path_01(i32* %p) { ; CHECK-LABEL: @test_load_on_exiting_cold_path_01( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]] ; CHECK: hot_path: ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: cold_path: ; CHECK-NEXT: [[SIDE_COND:%.*]] = call i1 @side_effect_cond() #[[ATTR0]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br i1 [[SIDE_COND]], label [[BACKEDGE]], label [[COLD_EXIT:%.*]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -468,10 +473,11 @@ define i32 @test_load_on_multi_exiting_cold_path(i32* %p) { ; CHECK-LABEL: @test_load_on_multi_exiting_cold_path( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH_1:%.*]] ; CHECK: hot_path: @@ -484,8 +490,10 @@ ; CHECK-NEXT: br i1 [[SIDE_COND_2]], label [[COLD_PATH_3:%.*]], label [[COLD_EXIT]] ; CHECK: cold_path.3: ; CHECK-NEXT: [[SIDE_COND_3:%.*]] = call i1 @side_effect_cond() #[[ATTR0]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br i1 [[SIDE_COND_3]], label [[BACKEDGE]], label [[COLD_EXIT]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH_3]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -671,10 +679,11 @@ define i32 @test_side_exit_after_merge(i32* %p) { ; CHECK-LABEL: @test_side_exit_after_merge( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]] ; CHECK: hot_path: @@ -684,11 +693,14 @@ ; CHECK-NEXT: br i1 [[COND_1]], label [[DO_CALL:%.*]], label [[SIDE_EXITING:%.*]] ; CHECK: do_call: ; CHECK-NEXT: [[SIDE_COND:%.*]] = call i1 @side_effect_cond() #[[ATTR0]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[SIDE_EXITING]] ; CHECK: side_exiting: +; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_PRE]], [[DO_CALL]] ], [ 0, [[COLD_PATH]] ] ; CHECK-NEXT: [[SIDE_COND_PHI:%.*]] = phi i1 [ [[SIDE_COND]], [[DO_CALL]] ], [ true, [[COLD_PATH]] ] ; CHECK-NEXT: br i1 [[SIDE_COND_PHI]], label [[BACKEDGE]], label [[COLD_EXIT:%.*]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X3]], [[SIDE_EXITING]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -788,11 +800,12 @@ define i32 @test_guard_2(i32* %p, i32 %g) { ; CHECK-LABEL: @test_guard_2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_PRE1:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ] ; CHECK-NEXT: [[GUARD_COND:%.*]] = icmp ne i32 [[IV]], [[G:%.*]] -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[GUARD_COND]]) [ "deopt"() ] ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X]], 100 ; CHECK-NEXT: br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]] @@ -800,8 +813,10 @@ ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: cold_path: ; CHECK-NEXT: call void @side_effect() #[[ATTR0]] +; CHECK-NEXT: [[X_PRE:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: br label [[BACKEDGE]] ; CHECK: backedge: +; CHECK-NEXT: [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH]] ], [ [[X]], [[HOT_PATH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[X]] ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]