Index: llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -36,6 +36,7 @@ #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -57,6 +58,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> @@ -164,9 +166,14 @@ class LoadEliminationForLoop { public: LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, - DominatorTree *DT, BlockFrequencyInfo *BFI, - ProfileSummaryInfo* PSI) - : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {} + DominatorTree *DT, AssumptionCache *AC, + MemorySSA *MSSA, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) + : L(L), LI(LI), LAI(LAI), DT(DT), AC(AC), BFI(BFI), PSI(PSI), + PSE(LAI.getPSE()) { + if (MSSA) + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); + } /// Look through the loop-carried and loop-independent dependences in /// this loop and find store->load dependences. @@ -535,11 +542,6 @@ return false; } - if (!L->isLoopSimplifyForm()) { - LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form"); - return false; - } - if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { if (LAI.hasConvergentOp()) { LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with " @@ -561,6 +563,10 @@ // Point of no-return, start the transformation. First, version the loop // if necessary. + // TODO: Is simplification really necessary here? 
+ if (!L->isLoopSimplifyForm()) + simplifyLoop(L, DT, LI, PSE.getSE(), AC, MSSAU.get(), + /*PreserveLCSSA*/ false); LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); LV.setAliasChecks(std::move(Checks)); @@ -568,6 +574,11 @@ LV.versionLoop(); } + // Make sure the loop is in simplify form. We'll need the preheader below. + if (!L->isLoopSimplifyForm()) + simplifyLoop(L, DT, LI, PSE.getSE(), AC, MSSAU.get(), + /*PreserveLCSSA*/ false); + // Next, propagate the value stored by the store to the users of the load. // Also for the first iteration, generate the initial value of the load. SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), @@ -590,16 +601,20 @@ LoopInfo *LI; const LoopAccessInfo &LAI; DominatorTree *DT; + AssumptionCache *AC; BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; PredicatedScalarEvolution PSE; + std::unique_ptr<MemorySSAUpdater> MSSAU; }; } // end anonymous namespace static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + ScalarEvolution &SE, AssumptionCache *AC, + MemorySSA *MSSA, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, function_ref<const LoopAccessInfo &(Loop &)> GetLAI) { // Build up a worklist of inner-loops to transform to avoid iterator // invalidation. @@ -618,7 +633,7 @@ bool Changed = false; for (Loop *L : Worklist) { // The actual work is performed by LoadEliminationForLoop. - LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI); + LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, AC, MSSA, BFI, PSI); Changed |= LEL.processLoop(); } return Changed; @@ -647,11 +662,12 @@ auto *BFI = (PSI && PSI->hasProfileSummary()) ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() : nullptr; + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Process each loop nest in the function. 
- return eliminateLoadsAcrossLoops( - F, LI, DT, BFI, PSI, - [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); }); + return eliminateLoadsAcrossLoops(F, LI, DT, SE, nullptr, nullptr, BFI, PSI, + [&LAA](Loop & L)->const LoopAccessInfo & + { return LAA.getInfo(&L); }); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -679,7 +695,6 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) @@ -706,8 +721,9 @@ : nullptr; auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); - bool Changed = eliminateLoadsAcrossLoops( - F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & { + bool Changed = + eliminateLoadsAcrossLoops(F, LI, DT, SE, &AC, MSSA, BFI, PSI, + [&](Loop & L)->const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }); Index: llvm/test/Transforms/LoopLoadElim/cond-load.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/cond-load.ll +++ llvm/test/Transforms/LoopLoadElim/cond-load.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-load-elim < %s | FileCheck %s +; RUN: opt -S -passes=loop-load-elim < %s | FileCheck %s ; We can't hoist conditional loads to the preheader for the initial value. ; E.g. 
in the loop below we'd access array[-1] if we did: Index: llvm/test/Transforms/LoopLoadElim/def-store-before-load.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/def-store-before-load.ll +++ llvm/test/Transforms/LoopLoadElim/def-store-before-load.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; No loop-carried forwarding: The intervening store to A[i] kills the stored ; value from the previous iteration. Index: llvm/test/Transforms/LoopLoadElim/loop-simplify-dep.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/loop-simplify-dep.ll +++ llvm/test/Transforms/LoopLoadElim/loop-simplify-dep.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; Make sure we create a preheader if we dont' have one. Index: llvm/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll +++ llvm/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll @@ -1,5 +1,5 @@ ; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s - +; RUN: opt -aa-pipeline=basic-aa -passes='require,loop-load-elim' -S < %s | FileCheck %s ; In this case the later store forward to the load: ; ; for (unsigned i = 0; i < 100; i++) { Index: llvm/test/Transforms/LoopLoadElim/new-pm-preheader-problem.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/new-pm-preheader-problem.ll +++ llvm/test/Transforms/LoopLoadElim/new-pm-preheader-problem.ll @@ -1,3 +1,4 @@ +; RUN: opt -loop-load-elim -S < %s | FileCheck %s ; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; Make sure it doesn't crash in new pass manager due to missing preheader. 
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @@ -7,12 +8,19 @@ br i1 %C, label %for.body, label %for.end ; CHECK: test +; CHECK: for.body.preheader: +; CHECK-NEXT: %load_initial = load i32, i32* %A +; CHECK-NEXT: br label %for.body + +; CHECK: for.body: for.body: +; CHECK-NEXT: %store_forwarded = phi i32 [ %load_initial, %for.body.preheader ], [ %add, %for.body ] %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv %load = load i32, i32* %arrayidx, align 4 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv %load_1 = load i32, i32* %arrayidx2, align 4 +; CHECK: %add = add i32 %load_1, %store_forwarded %add = add i32 %load_1, %load %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next Index: llvm/test/Transforms/LoopLoadElim/non-consecutive.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/non-consecutive.ll +++ llvm/test/Transforms/LoopLoadElim/non-consecutive.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; The accesses to A are independent here but LAA reports it as a loop-carried ; forward dependence. 
Check that we don't perform st->ld forwarding between Index: llvm/test/Transforms/LoopLoadElim/opt-size.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/opt-size.ll +++ llvm/test/Transforms/LoopLoadElim/opt-size.ll @@ -1,6 +1,8 @@ ; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s ; RUN: opt -basicaa -loop-load-elim -pgso -S < %s | FileCheck %s -check-prefix=PGSO ; RUN: opt -basicaa -loop-load-elim -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO +; RUN: opt -aa-pipeline=basic-aa -passes='require,loop-load-elim' -S < %s | FileCheck %s +; TODO: Enable tests with PGSO in new pipeline. ; When optimizing for size don't eliminate in this loop because the loop would ; have to be versioned first because A and C may alias. Index: llvm/test/Transforms/LoopLoadElim/type-mismatch.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/type-mismatch.ll +++ llvm/test/Transforms/LoopLoadElim/type-mismatch.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -passes=loop-load-elim -S < %s | FileCheck %s ; Don't crash if the store and the load use different types. ; Index: llvm/test/Transforms/LoopLoadElim/unknown-dep.ll =================================================================== --- llvm/test/Transforms/LoopLoadElim/unknown-dep.ll +++ llvm/test/Transforms/LoopLoadElim/unknown-dep.ll @@ -1,5 +1,5 @@ ; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s - +; RUN: opt -aa-pipeline=basic-aa -passes='require,loop-load-elim' -S < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Give up in the presence of unknown deps. Here, the different strides result