Index: llvm/include/llvm/Transforms/Utils/SSAUpdater.h =================================================================== --- llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -169,6 +169,10 @@ /// Called to update debug info associated with the instruction. virtual void updateDebugInfo(Instruction *I) const {} + + /// Used to indicate that we want to keep the store as is (since it was not + /// safe for sinking), by hoisting load only. + virtual bool shouldPreserveStore() const { return false; } }; } // end namespace llvm Index: llvm/lib/Transforms/Scalar/LICM.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LICM.cpp +++ llvm/lib/Transforms/Scalar/LICM.cpp @@ -117,6 +117,13 @@ cl::desc("Max num uses visited for identifying load " "invariance in loop using invariant start (default = 8)")); +// Hoist the load without sinking the store. Some targets (such as aarch64) +// will gain benefits out of this, since there will be better register +// promotion. +static cl::opt<bool> EnableLoadHoistOnly( + "licm-load-hoisting-only", cl::Hidden, cl::init(false), + cl::desc("Enable hoisting loads even if we can't sink the store.")); + // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to // address the same issue. This flag applies only when LICM uses MemorySSA @@ -496,6 +503,7 @@ if (Changed && SE) SE->forgetLoopDispositions(L); + return Changed; } @@ -1858,6 +1866,8 @@ bool UnorderedAtomic; AAMDNodes AATags; ICFLoopSafetyInfo &SafetyInfo; + // Hoist the load even if we cannot sink the store. + bool HoistLoadOnly; // We're about to add a use of V in a loop exit block. 
Insert an LCSSA phi // (if legal) if doing so would add an out-of-loop use to an instruction @@ -1884,12 +1894,12 @@ SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC, MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, int alignment, bool UnorderedAtomic, const AAMDNodes &AATags, - ICFLoopSafetyInfo &SafetyInfo) + ICFLoopSafetyInfo &SafetyInfo, bool HoistLoadOnly) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)), Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), - SafetyInfo(SafetyInfo) {} + SafetyInfo(SafetyInfo), HoistLoadOnly(HoistLoadOnly) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -1901,7 +1911,7 @@ return PointerMustAliases.count(Ptr); } - void doExtraRewritesBeforeFinalDeletion() override { + void insertStoresInLoopExitBlocks() { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -1935,10 +1945,17 @@ } } + void doExtraRewritesBeforeFinalDeletion() override { + if (!HoistLoadOnly) + insertStoresInLoopExitBlocks(); + } + void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); MSSAU->removeMemoryAccess(I); } + + bool shouldPreserveStore() const override { return HoistLoadOnly; } }; bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, @@ -2037,6 +2054,7 @@ bool DereferenceableInPH = false; bool SafeToInsertStore = false; + bool FoundLoadToPromote = false; SmallVector<Instruction *, 64> LoopUses; @@ -2103,6 +2121,7 @@ DereferenceableInPH = true; Alignment = std::max(Alignment, InstAlignment); } + FoundLoadToPromote = true; } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. 
@@ -2197,8 +2216,20 @@ } } - // If we've still failed to prove we can sink the store, give up. - if (!SafeToInsertStore) + // There is a special case where we move the load before the loop by + // preserving the store as is, since it may improve register promotion + // for some targets. This is controlled by the EnableLoadHoistOnly flag. + bool ShouldTryLoadHoistingOnly = false; + // If we've still failed to prove we can sink the store, give up + // (unless EnableLoadHoistOnly is true). + if (!SafeToInsertStore) { + if (DereferenceableInPH && EnableLoadHoistOnly) + ShouldTryLoadHoistingOnly = true; + else + return false; + } + + if (ShouldTryLoadHoistingOnly && !FoundLoadToPromote) return false; // Otherwise, this is safe to promote, lets do it! @@ -2223,7 +2254,7 @@ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL, Alignment.value(), SawUnorderedAtomic, AATags, - *SafetyInfo); + *SafetyInfo, ShouldTryLoadHoistingOnly); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. Index: llvm/lib/Transforms/Utils/SSAUpdater.cpp =================================================================== --- llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -443,9 +443,16 @@ // Allow the client to do stuff before we start nuking things. doExtraRewritesBeforeFinalDeletion(); + // Hoist the loads only if we didn't manage to sink the stores safely. + bool PreserveTheStores = shouldPreserveStore(); + // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + // Keep the store since the load is moved above only. 
+ if (PreserveTheStores && isa<StoreInst>(User)) + continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively Index: llvm/test/Transforms/LICM/AArch64/hoist-load-without-store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LICM/AArch64/hoist-load-without-store.ll @@ -0,0 +1,127 @@ +;; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +;; This tests the new ability of the LICM Pass to perform hoisting of loads +;; even if we can't sink the store. + +;; C reproducer: +;; int u, v; +;; +;; void f(int a[restrict], int b[restrict], int n) { +;; for (int i = 0; i < n; ++i) { +;; if (a[i]) { +;; ++u; +;; break; +;; } +;; ++u; +;; if (b[i]) +;; ++v; +;; } +;; } + +; RUN: opt -licm -licm-load-hoisting-only -mtriple aarch64-linux-gnu -S < %s | FileCheck %s + + +; ModuleID = './test.ll' +source_filename = "test.c" +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux" + +@u = dso_local global i32 0, align 4 +@v = dso_local global i32 0, align 4 + +; Function Attrs: nofree norecurse nosync nounwind uwtable +define dso_local void @f(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %n) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[U_PROMOTED:%.*]] = load i32, i32* @u, align 4 +; CHECK-NEXT: [[V_PROMOTED:%.*]] = load i32, i32* @v, align 4 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INC64:%.*]] = phi i32 [ [[V_PROMOTED]], [[FOR_BODY_PREHEADER]] ], [ 
[[INC63:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[INC11:%.*]] = phi i32 [ [[U_PROMOTED]], [[FOR_BODY_PREHEADER]] ], [ [[INC1:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: [[INC1]] = add nsw i32 [[INC11]], 1 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[CLEANUP_LOOPEXIT:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TOBOOL4_NOT]], label [[FOR_INC]], label [[IF_THEN5:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[INC6:%.*]] = add nsw i32 [[INC64]], 1 +; CHECK-NEXT: store i32 [[INC6]], i32* @v, align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC63]] = phi i32 [ [[INC6]], [[IF_THEN5]] ], [ [[INC64]], [[IF_END]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: cleanup.loopexit: +; CHECK-NEXT: [[INC12:%.*]] = phi i32 [ [[INC1]], [[FOR_INC]] ], [ [[INC1]], [[FOR_BODY]] ] +; CHECK-NEXT: store i32 [[INC12]], i32* @u, align 4 +; CHECK-NEXT: br label [[CLEANUP]] +; CHECK: cleanup: +; CHECK-NEXT: ret void +; +entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, 
%for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %0, 0 + %1 = load i32, i32* @u, align 4 + %inc1 = add nsw i32 %1, 1 + store i32 %inc1, i32* @u, align 4 + br i1 %tobool.not, label %if.end, label %cleanup.loopexit + +if.end: ; preds = %for.body + %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %2 = load i32, i32* %arrayidx3, align 4 + %tobool4.not = icmp eq i32 %2, 0 + br i1 %tobool4.not, label %for.inc, label %if.then5 + +if.then5: ; preds = %if.end + %3 = load i32, i32* @v, align 4 + %inc6 = add nsw i32 %3, 1 + store i32 %inc6, i32* @v, align 4 + br label %for.inc + +for.inc: ; preds = %if.end, %if.then5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %cleanup.loopexit, label %for.body, !llvm.loop !12 + +cleanup.loopexit: ; preds = %for.body, %for.inc + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry + ret void +} + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} +!llvm.ident = !{!7} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"branch-target-enforcement", i32 0} +!2 = !{i32 1, !"sign-return-address", i32 0} +!3 = !{i32 1, !"sign-return-address-all", i32 0} +!4 = !{i32 1, !"sign-return-address-with-bkey", i32 0} +!5 = !{i32 7, !"uwtable", i32 1} +!6 = !{i32 7, !"frame-pointer", i32 1} +!7 = !{!"clang version 14.0.0"} +!12 = distinct !{!12, !13} +!13 = !{!"llvm.loop.mustprogress"}