Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -210,7 +210,7 @@ SmallVectorImpl &, SmallVectorImpl &, PredIteratorCache &, LoopInfo *, DominatorTree *, const TargetLibraryInfo *, Loop *, MemorySSAUpdater *, ICFLoopSafetyInfo *, - OptimizationRemarkEmitter *); + OptimizationRemarkEmitter *, SmallPtrSetImpl &); /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. Index: llvm/include/llvm/Transforms/Utils/SSAUpdater.h =================================================================== --- llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -169,6 +169,12 @@ /// Called to update debug info associated with the instruction. virtual void updateDebugInfo(Instruction *I) const {} + + /// Used to indicate if we want to keep stores by hoisting loads only. + virtual bool shouldPreserveStore() const { return false; } + + /// Called for each instruction that is preserved (kept in place) rather than deleted. + virtual void instructionPreserved(Instruction *I) const {} }; } // end namespace llvm Index: llvm/lib/Transforms/Scalar/LICM.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LICM.cpp +++ llvm/lib/Transforms/Scalar/LICM.cpp @@ -117,6 +117,13 @@ cl::desc("Max num uses visited for identifying load " "invariance in loop using invariant start (default = 8)")); +// Hoist the load without hoisting the store. Some targets (such as aarch64) +// will benefit from this, since there will be better register +// promotion. 
+static cl::opt EnableLoadHoistOnly( + "licm-load-hoisting-only", cl::Hidden, cl::init(false), + cl::desc("Enable hoisting loads without hoisting the stores")); + // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to // address the same issue. This flag applies only when LICM uses MemorySSA @@ -183,7 +190,8 @@ static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L, function_ref Fn); static SmallVector, 0> -collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L); +collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L, + SmallPtrSetImpl &SavedInsts); namespace { struct LoopInvariantCodeMotion { @@ -404,6 +412,8 @@ ICFLoopSafetyInfo SafetyInfo; SafetyInfo.computeLoopSafetyInfo(L); + llvm::SmallPtrSet SavedInstructions; + // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of // their loop, into this loop, so there is no need to process the BODIES of @@ -463,10 +473,10 @@ do { LocalPromoted = false; for (const SmallSetVector &PointerMustAliases : - collectPromotionCandidates(MSSA, AA, L)) { + collectPromotionCandidates(MSSA, AA, L, SavedInstructions)) { LocalPromoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, - LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE); + PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, + DT, TLI, L, &MSSAU, &SafetyInfo, ORE, SavedInstructions); } Promoted |= LocalPromoted; } while (LocalPromoted); @@ -496,6 +506,7 @@ if (Changed && SE) SE->forgetLoopDispositions(L); + return Changed; } @@ -1858,6 +1869,8 @@ bool UnorderedAtomic; AAMDNodes AATags; ICFLoopSafetyInfo &SafetyInfo; + bool HoistLoadOnly; + SmallPtrSetImpl &SavedInsts; // We're about to add a use of V in a loop exit block. 
Insert an LCSSA phi // (if legal) if doing so would add an out-of-loop use to an instruction @@ -1884,12 +1897,14 @@ SmallVectorImpl &MSSAIP, PredIteratorCache &PIC, MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, int alignment, bool UnorderedAtomic, const AAMDNodes &AATags, - ICFLoopSafetyInfo &SafetyInfo) + ICFLoopSafetyInfo &SafetyInfo, bool HoistLoadOnly, + SmallPtrSetImpl &SavedInsts) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)), Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), - SafetyInfo(SafetyInfo) {} + SafetyInfo(SafetyInfo), HoistLoadOnly(HoistLoadOnly), + SavedInsts(SavedInsts) {} bool isInstInList(Instruction *I, const SmallVectorImpl &) const override { @@ -1901,7 +1916,7 @@ return PointerMustAliases.count(Ptr); } - void doExtraRewritesBeforeFinalDeletion() override { + void insertStoresInLoopExitBlocks() { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -1935,10 +1950,21 @@ } } + void doExtraRewritesBeforeFinalDeletion() override { + if (!HoistLoadOnly) + insertStoresInLoopExitBlocks(); + } + void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); MSSAU->removeMemoryAccess(I); } + + void instructionPreserved(Instruction *I) const override { + SavedInsts.insert(I); + } + + bool shouldPreserveStore() const override { return HoistLoadOnly; } }; bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, @@ -1981,6 +2007,9 @@ /// loop and moving loads to before the loop. We do this by looping over /// the stores in the loop, looking for stores to Must pointers which are /// loop invariant. 
+/// There is a special case where we move the load before the loop by +/// preserving the stores as is, since it may improve register promotion +/// for some targets. This is controlled by the EnableLoadHoistOnly flag. /// bool llvm::promoteLoopAccessesToScalars( const SmallSetVector &PointerMustAliases, @@ -1989,7 +2018,8 @@ SmallVectorImpl &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE) { + OptimizationRemarkEmitter *ORE, + SmallPtrSetImpl &SavedInsts) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -2197,9 +2227,14 @@ } } + bool ShouldTryLoadHoistingOnly = false; // If we've still failed to prove we can sink the store, give up. - if (!SafeToInsertStore) - return false; + if (!SafeToInsertStore) { + if (DereferenceableInPH && EnableLoadHoistOnly) + ShouldTryLoadHoistingOnly = true; + else + return false; + } // Otherwise, this is safe to promote, lets do it! LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr @@ -2223,7 +2258,7 @@ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL, Alignment.value(), SawUnorderedAtomic, AATags, - *SafetyInfo); + *SafetyInfo, ShouldTryLoadHoistingOnly, SavedInsts); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
@@ -2268,12 +2303,16 @@ } static SmallVector, 0> -collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) { +collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L, + SmallPtrSetImpl &SavedInsts) { AliasSetTracker AST(*AA); - auto IsPotentiallyPromotable = [L](const Instruction *I) { - if (const auto *SI = dyn_cast(I)) + auto IsPotentiallyPromotable = [L, &SavedInsts](const Instruction *I) { + if (const auto *SI = dyn_cast(I)) { + if (SavedInsts.count(I)) + return false; return L->isLoopInvariant(SI->getPointerOperand()); + } if (const auto *LI = dyn_cast(I)) return L->isLoopInvariant(LI->getPointerOperand()); return false; Index: llvm/lib/Transforms/Utils/SSAUpdater.cpp =================================================================== --- llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -443,9 +443,20 @@ // Allow the client to do stuff before we start nuking things. doExtraRewritesBeforeFinalDeletion(); + // Hoist the loads only if we didn't manage to sink the stores safely. + bool PreserveTheStores = shouldPreserveStore(); + // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + // Keep the store since the load is moved above only. + if (PreserveTheStores && isa(User)) { + // We pretend as if this was deleted, since we don't want + // to track this mem access here any more. + instructionPreserved(User); + continue; + } + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). 
In this case, we need to recursively Index: llvm/test/Transforms/LICM/AArch64/hoist-load-without-store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LICM/AArch64/hoist-load-without-store.ll @@ -0,0 +1,92 @@ +;; C reproducer: +;; int u, v; +;; +;; void f(int a[restrict], int b[restrict], int n) { +;; for (int i = 0; i < n; ++i) { +;; if (a[i]) { +;; ++u; +;; break; +;; } +;; ++u; +;; if (b[i]) +;; ++v; +;; } +;; } + +; RUN: opt -licm -licm-load-hoisting-only -mtriple aarch64-linux-gnu -S < %s | FileCheck %s + +; CHECK-LABEL: for.body.preheader: +; CHECK: %u.promoted = load i32, i32* @u +; CHECK: %v.promoted = load i32, i32* @v + +; ModuleID = './test.ll' +source_filename = "test.c" +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux" + +@u = dso_local local_unnamed_addr global i32 0, align 4 +@v = dso_local local_unnamed_addr global i32 0, align 4 + +; Function Attrs: nofree norecurse nosync nounwind uwtable +define dso_local void @f(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %n) local_unnamed_addr #0 { +entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !8 + %tobool.not = icmp eq i32 %0, 0 + %1 = load i32, i32* @u, align 4, !tbaa !8 + %inc1 = add nsw i32 %1, 1 + store i32 %inc1, i32* @u, align 4, !tbaa !8 + br i1 %tobool.not, label %if.end, label %cleanup.loopexit + +if.end: ; preds = %for.body + %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %2 = load i32, i32* %arrayidx3, align 4, !tbaa !8 + 
%tobool4.not = icmp eq i32 %2, 0 + br i1 %tobool4.not, label %for.inc, label %if.then5 + +if.then5: ; preds = %if.end + %3 = load i32, i32* @v, align 4, !tbaa !8 + %inc6 = add nsw i32 %3, 1 + store i32 %inc6, i32* @v, align 4, !tbaa !8 + br label %for.inc + +for.inc: ; preds = %if.end, %if.then5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %cleanup.loopexit, label %for.body, !llvm.loop !12 + +cleanup.loopexit: ; preds = %for.body, %for.inc + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind uwtable "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} +!llvm.ident = !{!7} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"branch-target-enforcement", i32 0} +!2 = !{i32 1, !"sign-return-address", i32 0} +!3 = !{i32 1, !"sign-return-address-all", i32 0} +!4 = !{i32 1, !"sign-return-address-with-bkey", i32 0} +!5 = !{i32 7, !"uwtable", i32 1} +!6 = !{i32 7, !"frame-pointer", i32 1} +!7 = !{!"clang version 14.0.0"} +!8 = !{!9, !9, i64 0} +!9 = !{!"int", !10, i64 0} +!10 = !{!"omnipotent char", !11, i64 0} +!11 = !{!"Simple C/C++ TBAA"} +!12 = distinct !{!12, !13} +!13 = !{!"llvm.loop.mustprogress"} \ No newline at end of file