diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -328,6 +328,12 @@
   bool PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
                       UnavailBlkVect &UnavailableBlocks);
 
+  /// Try to replace a load which executes on each loop iteration with a Phi
+  /// translation of the load in the preheader and load(s) in conditionally
+  /// executed paths.
+  bool performLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
+                          UnavailBlkVect &UnavailableBlocks);
+
   /// Eliminates partially redundant \p Load, replacing it with \p
   /// AvailableLoads (connected by Phis if needed).
   void eliminatePartiallyRedundantLoad(
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -91,13 +91,14 @@
 
 #define DEBUG_TYPE "gvn"
 
-STATISTIC(NumGVNInstr,  "Number of instructions deleted");
-STATISTIC(NumGVNLoad,   "Number of loads deleted");
-STATISTIC(NumGVNPRE,    "Number of instructions PRE'd");
+STATISTIC(NumGVNInstr, "Number of instructions deleted");
+STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
 STATISTIC(NumGVNBlocks, "Number of blocks merged");
-STATISTIC(NumGVNSimpl,  "Number of instructions simplified");
+STATISTIC(NumGVNSimpl, "Number of instructions simplified");
 STATISTIC(NumGVNEqProp, "Number of equalities propagated");
-STATISTIC(NumPRELoad,   "Number of loads PRE'd");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
+STATISTIC(NumPRELoopLoad, "Number of loop loads PRE'd");
 
 STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax,
           "Number of blocks speculated as available in "
@@ -1447,6 +1448,84 @@
   return true;
 }
 
+bool GVN::performLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
+                             UnavailBlkVect &UnavailableBlocks) {
+  if (!LI)
+    return false;
+
+  const Loop *L = LI->getLoopFor(Load->getParent());
+  // TODO: Generalize to other loop blocks that dominate the latch.
+  if (!L || L->getHeader() != Load->getParent())
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Preheader || !Latch)
+    return false;
+
+  Value *LoadPtr = Load->getPointerOperand();
+  // Must be available in the preheader.
+  if (!L->isLoopInvariant(LoadPtr))
+    return false;
+
+  // We plan to hoist the load to the preheader without introducing a new
+  // fault. In order to do that, we need to prove that we cannot side-exit the
+  // loop once the loop header is first entered before execution of the load.
+  if (ICF->isDominatedByICFIFromSameBlock(Load))
+    return false;
+
+  BasicBlock *LoopBlock = nullptr;
+  for (auto *Blocker : UnavailableBlocks) {
+    // Blockers from outside the loop are handled in the preheader.
+    if (!L->contains(Blocker))
+      continue;
+
+    // Only allow one loop block. The loop header is not less frequently
+    // executed than each loop block, and likely it is much more frequently
+    // executed. But in case of multiple loop blocks, we need extra information
+    // (such as block frequency info) to understand whether it is profitable to
+    // PRE into multiple loop blocks.
+    if (LoopBlock)
+      return false;
+
+    // Do not sink into inner loops. This may not be profitable.
+    if (L != LI->getLoopFor(Blocker))
+      return false;
+
+    // Blocks that dominate the latch execute on every single iteration, maybe
+    // except the last one. So PREing into these blocks doesn't make much sense
+    // in most cases. But the blocks that do not necessarily execute on each
+    // iteration are sometimes much colder than the header, and this is when
+    // PRE is potentially profitable.
+    if (DT->dominates(Blocker, Latch))
+      return false;
+
+    // Make sure that the terminator itself doesn't clobber.
+    if (Blocker->getTerminator()->mayWriteToMemory())
+      return false;
+
+    LoopBlock = Blocker;
+  }
+
+  if (!LoopBlock)
+    return false;
+
+  // Make sure the memory at this pointer cannot be freed, so that we can
+  // safely reload from it after a clobber.
+  if (LoadPtr->canBeFreed())
+    return false;
+
+  // TODO: Support critical edge splitting if blocker has more than 1 successor.
+  MapVector<BasicBlock *, Value *> AvailableLoads;
+  AvailableLoads[LoopBlock] = LoadPtr;
+  AvailableLoads[Preheader] = LoadPtr;
+
+  LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOOP LOAD: " << *Load << '\n');
+  eliminatePartiallyRedundantLoad(Load, ValuesPerBlock, AvailableLoads);
+  ++NumPRELoopLoad;
+  return true;
+}
+
 static void reportLoadElim(LoadInst *Load, Value *AvailableValue,
                            OptimizationRemarkEmitter *ORE) {
   using namespace ore;
@@ -1544,7 +1623,8 @@
   if (!isLoadInLoopPREEnabled() && LI && LI->getLoopFor(Load->getParent()))
     return Changed;
 
-  return Changed || PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks);
+  return Changed || PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks) ||
+         performLoopLoadPRE(Load, ValuesPerBlock, UnavailableBlocks);
 }
 
 static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
diff --git a/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll b/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll
--- a/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-loop-load.ll
@@ -7,22 +7,25 @@
 
 declare i32 @personality_function()
 
-; TODO: We can PRE the load from gc-managed memory away from the hot path.
+; We can PRE the load from gc-managed memory away from the hot path.
 define i32 @test_load_on_cold_path_gc(i32 addrspace(1)* %p) gc "statepoint-example" personality i32 ()* @"personality_function" {
 ; CHECK-LABEL: @test_load_on_cold_path_gc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_PRE1:%.*]] = load i32, i32 addrspace(1)* [[P:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
-; CHECK-NEXT:    [[X:%.*]] = load i32, i32 addrspace(1)* [[P:%.*]], align 4
+; CHECK-NEXT:    [[X:%.*]] = phi i32 [ [[X_PRE1]], [[ENTRY:%.*]] ], [ [[X2:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE]] ]
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[HOT_PATH:%.*]], label [[COLD_PATH:%.*]]
 ; CHECK:       hot_path:
 ; CHECK-NEXT:    br label [[BACKEDGE]]
 ; CHECK:       cold_path:
 ; CHECK-NEXT:    call void @may_free_memory()
+; CHECK-NEXT:    [[X_PRE:%.*]] = load i32, i32 addrspace(1)* [[P]], align 4
 ; CHECK-NEXT:    br label [[BACKEDGE]]
 ; CHECK:       backedge:
+; CHECK-NEXT:    [[X2]] = phi i32 [ [[X_PRE]], [[COLD_PATH]] ], [ [[X]], [[HOT_PATH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], [[X]]
 ; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]