Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -346,7 +346,8 @@
                         Loop *CurLoop, AliasSetTracker *CurAST,
                         MemorySSAUpdater *MSSAU, bool TargetExecutesOncePerLoop,
                         SinkAndHoistLICMFlags *LICMFlags = nullptr,
-                        OptimizationRemarkEmitter *ORE = nullptr);
+                        OptimizationRemarkEmitter *ORE = nullptr,
+                        ScalarEvolution *SE = nullptr);
 
 /// Returns the comparison predicate used when expanding a min/max reduction.
 CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
Index: llvm/lib/Transforms/Scalar/LICM.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LICM.cpp
+++ llvm/lib/Transforms/Scalar/LICM.cpp
@@ -169,6 +169,9 @@
                                   SinkAndHoistLICMFlags &Flags);
 static bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
                                               MemoryUse &MU);
+static bool checkHeuristicCase(MemorySSA *MSSA, MemoryUse *MU, Loop *CurLoop,
+                               LoadInst *LI, SinkAndHoistLICMFlags &Flags,
+                               ScalarEvolution *SE);
 static Instruction *cloneInstructionInExitBlock(
     Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
     const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
@@ -908,7 +911,7 @@
       // to that block.
       if (CurLoop->hasLoopInvariantOperands(&I) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU,
-                             true, &Flags, ORE) &&
+                             true, &Flags, ORE, SE) &&
           worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
           isSafeToExecuteUnconditionally(
               I, DT, TLI, CurLoop, SafetyInfo, ORE,
@@ -1160,7 +1163,8 @@
                         MemorySSAUpdater *MSSAU, bool TargetExecutesOncePerLoop,
                         SinkAndHoistLICMFlags *Flags,
-                        OptimizationRemarkEmitter *ORE) {
+                        OptimizationRemarkEmitter *ORE,
+                        ScalarEvolution *SE) {
   assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
          "Either AliasSetTracker or MemorySSA should be initialized.");
@@ -1198,6 +1202,12 @@
     else
       Invalidated = pointerInvalidatedByLoopWithMSSA(
           MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, I, *Flags);
+
+    if (Invalidated)
+      Invalidated =
+          checkHeuristicCase(MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)),
+                             CurLoop, LI, *Flags, SE);
+
     // Check loop-invariant address because this may also be a sinkable load
     // whose address is not necessarily loop-invariant.
     if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
@@ -2382,6 +2392,83 @@
   return false;
 }
 
+// Return false if the load can be hoisted despite a MayAlias clobber inside
+// the loop; return true if it must still be treated as invalidated.
+bool checkHeuristicCase(MemorySSA *MSSA, MemoryUse *MU, Loop *CurLoop,
+                        LoadInst *LI, SinkAndHoistLICMFlags &Flags,
+                        ScalarEvolution *SE) {
+  // Consider the following example with MemorySSA:
+  //
+  // loop.header:
+  //   ; 2 = MemoryPhi({entry,liveOnEntry},{loop.body,1})
+  //   %i.0 = phi i32 [ 0, %entry ], [ %inc, %loop.body ]
+  //   ...
+  //
+  // loop.body:
+  //   ; MemoryUse(2) MayAlias
+  //   %0 = load i32**, i32*** @a, align 8, !tbaa !8
+  //   %idxprom = zext i32 %i.0 to i64
+  //   %arrayidx = getelementptr inbounds i32*, i32** %0, i64 %idxprom
+  //   ; 1 = MemoryDef(2)
+  //   store i32* null, i32** %arrayidx, align 8, !tbaa !8
+  //   ...
+  //
+  // There is a MayAlias dependence between `%0 = load` (MemoryUse(2)) and
+  // the `store` (1 = MemoryDef(2)). However, it is only a WAR dependence in
+  // the first iteration of the loop; there is no dependence in the remaining
+  // iterations. In this case, we can hoist `%0 = load` to the preheader.
+
+  // Only hoisting of load instructions is considered.
+  if (Flags.getIsSink())
+    return true;
+
+  // The pointer-difference check below needs ScalarEvolution, which callers
+  // of canSinkOrHoistInst may not provide.
+  if (!SE)
+    return true;
+
+  // Find the defining memory access of the load.
+  MemoryAccess *Source = MU->getDefiningAccess();
+
+  // Only a MemoryPhi is handled as the defining access.
+  auto *MemPhi = dyn_cast<MemoryPhi>(Source);
+  if (!MemPhi)
+    return true;
+
+  // For simplicity, this function only handles a MemoryPhi that has exactly
+  // one MemoryDef as an incoming value from inside the loop.
+  Instruction *DefInst = nullptr;
+  for (unsigned I = 0, E = MemPhi->getNumIncomingValues(); I != E; ++I) {
+    auto *IncAcc = MemPhi->getIncomingValue(I);
+    if (CurLoop->contains(IncAcc->getBlock())) {
+      if (auto *DefMA = dyn_cast<MemoryDef>(IncAcc)) {
+        if (DefInst)
+          return true;
+        DefInst = DefMA->getMemoryInst();
+      }
+    }
+  }
+
+  if (!DefInst)
+    return true;
+
+  // Only a store instruction is handled as the MemoryDef.
+  auto *SI = dyn_cast<StoreInst>(DefInst);
+  if (!SI)
+    return true;
+
+  // Check that the underlying object of the store's address is the loaded
+  // value itself.
+  Value *Object = getUnderlyingObject(SI->getPointerOperand());
+  if (Object != LI)
+    return true;
+
+  // Check whether the difference between the store's address and the loaded
+  // value is an AddRecExpr. If it is, the store's address strides away from
+  // the loaded pointer on every iteration, so there is no loop-carried
+  // dependence between the load and the store. In the example above,
+  // StorePtrSCEV is {%0,+,8}<loop>, LoadSCEV is %0, and PtrDiffSCEV is
+  // {0,+,8}<loop>.
+  auto *LoadSCEV = SE->getSCEV(LI);
+  auto *StorePtrSCEV = SE->getSCEV(SI->getPointerOperand());
+  auto *PtrDiffSCEV = SE->getMinusSCEV(StorePtrSCEV, LoadSCEV);
+  if (isa<SCEVAddRecExpr>(PtrDiffSCEV))
+    return false;
+
+  return true;
+}
+
 /// Little predicate that returns true if the specified basic block is in
 /// a subloop of the current one, not the current one itself.
 ///
Index: llvm/test/Transforms/LICM/heuristic-hoist-load.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LICM/heuristic-hoist-load.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='loop-mssa(licm)' -S < %s | FileCheck %s
+
+@a = dso_local local_unnamed_addr global i32** null, align 8
+
+; The load %0 of @a should be hoisted into the entry block.
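+; The only in-loop clobber is the store through %arrayidx. Its address is
+; the loaded pointer plus an AddRec offset, so at most the first iteration
+; forms a WAR dependence with the load and no later iteration depends on it.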
+
+define void @foo(i32 %max) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32**, i32*** @a, align 8
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp sgt i32 [[I_0]], [[MAX:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32*, i32** [[TMP0]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i32* null, i32** [[ARRAYIDX]], align 8, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+;
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp.not = icmp sgt i32 %i.0, %max
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %0 = load i32**, i32*** @a, align 8, !tbaa !8
+  %idxprom = zext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32*, i32** %0, i64 %idxprom
+  store i32* null, i32** %arrayidx, align 8, !tbaa !8
+  %inc = add nuw nsw i32 %i.0, 1
+  br label %for.cond, !llvm.loop !12
+}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !13}
+!13 = !{!"llvm.loop.mustprogress"}
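+
+; A C++-level reduction of this test (hypothetical, for illustration only;
+; the names mirror the IR above):
+;
+;   int **a;
+;
+;   void foo(int max) {
+;     for (int i = 0; i <= max; ++i)
+;       a[i] = nullptr;
+;   }
+;
+; Under TBAA both accesses are "any pointer", so the store to a[i] MayAlias
+; the load of @a. The heuristic recognizes that the store address strides
+; away from the loaded pointer, leaving at most a first-iteration WAR, and
+; lets LICM hoist the reload of @a out of the loop.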