Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp =================================================================== --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include using namespace llvm; @@ -1417,6 +1418,133 @@ return true; } +// Comparator and maps for forward user analysis. +struct InstCmp { + bool operator()(const Instruction *L, const Instruction *R) const { + return L->comesBefore(R); + }; +}; +// Instruction to Dest & Src ModRefInfo +using InstModRefMap = + std::map, InstCmp>; +using BasicBlockInstModRefMap = DenseMap; + +bool IsDef(Instruction *UI, std::optional AllocaSize, + const DataLayout &DL) { + if (IntrinsicInst *II = dyn_cast(UI)) { + if (II->isLifetimeStartOrEnd()) { + // We treat a call to a lifetime intrinsic that covers the entire + // alloca as a definition, since both llvm.lifetime.start and + // llvm.lifetime.end intrinsics conceptually fill all the bytes of + // the alloca with an undefined value. We also note these + // locations of these intrinsic calls so that we can delete them + // later if the optimization succeeds. + int64_t Size = cast(II->getArgOperand(0))->getSExtValue(); + + if (Size < 0 || uint64_t(Size) == AllocaSize) { + return true; + } + } else if (MemIntrinsic *MI = dyn_cast(II)) { + if (ConstantInt *CI = dyn_cast(MI->getLength())) { + if (CI->getZExtValue() == (*AllocaSize).getFixedValue()) + return true; + } + } + } else if (StoreInst *SI = dyn_cast(UI)) { + // Stores that overwrite all bytes of the alloca are definitions. + if (DL.getTypeStoreSize(SI->getValueOperand()->getType()) == + (*AllocaSize).getFixedValue()) + return true; + } + return false; +} + +// Helper for stack-move optimization, states to represent the previous +// modifiers. +enum ModState { NoMod, SrcMod, DestMod, BothMod }; +// Represent the ModState change before and after the BB. +using BasicBlockModMap = std::set>; + +bool checkModRefConflict(BasicBlockInstModRefMap BBInstModRefMap, + AllocaInst *AI, Instruction *Store, + const DataLayout &DL) { + + std::optional AllocaSize = AI->getAllocationSize(DL); + BasicBlock *AllocaBB = AI->getParent(); + SmallVector, 8> Worklist; + BasicBlockModMap BBModMap; + Worklist.push_back({AllocaBB, ModState::NoMod}); + + while (!Worklist.empty()) { + auto &[BB, BeforeMS] = Worklist.back(); + Worklist.pop_back(); + ModState AfterMS = BeforeMS; + + // compute the modref effect for this BB + for (auto &[I, MR] : BBInstModRefMap[BB]) { + auto &[DestMR, SrcMR] = MR; + // modref conlict + // TODO: partial lifetime should be only Mod, but now ModRef. + if (IntrinsicInst *II = dyn_cast(I); + II && II->isLifetimeStartOrEnd()) { + int64_t Size = cast(II->getArgOperand(0))->getSExtValue(); + if (Size < 0 || uint64_t(Size) == AllocaSize) + continue; + } + + // lifetime intrinsics is modref, is this correct? + if (((AfterMS & ModState::SrcMod) && isRefSet(DestMR)) || + ((AfterMS & ModState::DestMod) && isRefSet(SrcMR))) { + + // also, we can ignore the sync pattern i.e. copy from dest/src to + // dest/src + // TODO: check this is full size or not. + if (IntrinsicInst *II = dyn_cast(I); + (!II || (II && !II->isLifetimeStartOrEnd()))) { + LLVM_DEBUG( + dbgs() + << "Stack Move: Detected ModRef conflict inside the function: " + << I->getParent()->getParent()->getName() + << "\n Instruction: " << I << "\n"); + I->print(dbgs()); + return false; + } + } + + // For the Store, we need to care about M + // TODO: this might be more instructions + if (I == Store) { + AfterMS = ModState::NoMod; + } else if (IsDef(I, AllocaSize, DL)) { // if the instruction is Def, other one's mod is erased + if (isModSet(DestMR)) + AfterMS = ModState::DestMod; + else if (isModSet(SrcMR)) + AfterMS = ModState::SrcMod; + } else if (isModSet(DestMR)) { + // FIXME: correctly define or operator + if (AfterMS == ModState::SrcMod) + AfterMS = ModState::BothMod; + else + AfterMS = ModState::DestMod; + } else if (isModSet(SrcMR)) { + if (AfterMS == ModState::DestMod) + AfterMS = ModState::BothMod; + else + AfterMS = ModState::SrcMod; + } + } + BBModMap.insert({BB, BeforeMS}); + + for (BasicBlock *Succ : successors(BB)) { + if (BBModMap.find({Succ, AfterMS}) != BBModMap.end()) + continue; + else + Worklist.push_back({Succ, AfterMS}); + } + } + return true; +} + using InsertionPt = PointerUnion; /// Find the nearest Instruction or BasicBlock that dominates both I1 and /// I2. @@ -1578,19 +1706,11 @@ if (PDom) PDom = findNearestCommonPostDominator(PDom, UI, PDT); } - if (UI->isLifetimeStartOrEnd()) { - // We note the locations of these intrinsic calls so that we can - // delete them later if the optimization succeeds, this is safe - // since both llvm.lifetime.start and llvm.lifetime.end intrinsics - // practically fill all the bytes of the alloca with an undefined - // value, although conceptually marked as alive/dead. - int64_t Size = cast(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; - } - } - if (UI->hasMetadata(LLVMContext::MD_noalias)) + + if (auto *II = dyn_cast(UI); + II && II->isLifetimeStartOrEnd()) + LifetimeMarkers.push_back(II); + else if (UI->hasMetadata(LLVMContext::MD_noalias)) NoAliasInstrs.insert(UI); if (!ModRefCallback(UI)) return false; @@ -1604,66 +1724,27 @@ // Check that dest has no Mod/Ref, from the alloca to the Store, except full // size lifetime intrinsics. And collect modref inst for the reachability // check. - ModRefInfo DestModRef = ModRefInfo::NoModRef; MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size)); - SmallVector ReachabilityWorklist; + BasicBlockInstModRefMap BBInstModRefMap; + auto DestModRefCallback = [&](Instruction *UI) -> bool { - // We don't care about the store itself. - if (UI == Store) - return true; ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc); - DestModRef |= Res; - if (isModOrRefSet(Res)) { - // Instructions reachability checks. - // FIXME: adding the Instruction version isPotentiallyReachableFromMany on - // lib/Analysis/CFG.cpp (currently only for BasicBlocks) might be helpful. - if (UI->getParent() == Store->getParent()) { - // The same block case is special because it's the only time we're - // looking within a single block to see which instruction comes first. - // Once we start looking at multiple blocks, the first instruction of - // the block is reachable, so we only need to determine reachability - // between whole blocks. - BasicBlock *BB = UI->getParent(); - - // If A comes before B, then B is definitively reachable from A. - if (UI->comesBefore(Store)) - return false; - - // If the user's parent block is entry, no predecessor exists. - if (BB->isEntryBlock()) - return true; - - // Otherwise, continue doing the normal per-BB CFG walk. - ReachabilityWorklist.append(succ_begin(BB), succ_end(BB)); - } else { - ReachabilityWorklist.push_back(UI->getParent()); - } - } + BBInstModRefMap[UI->getParent()].insert({UI, {Res, ModRefInfo::NoModRef}}); return true; }; if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) return false; - // Bailout if Dest may have any ModRef before Store. - if (!ReachabilityWorklist.empty() && - isPotentiallyReachableFromMany(ReachabilityWorklist, Store->getParent(), - nullptr, DT, nullptr)) - return false; - // Check that, from after the Load to the end of the BB, - // - if the dest has any Mod, src has no Ref, and - // - if the dest has any Ref, src has no Mod except full-sized lifetimes. MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size)); auto SrcModRefCallback = [&](Instruction *UI) -> bool { - // Any ModRef post-dominated by Load doesn't matter, also Load and Store - // themselves can be ignored. - if (PDT->dominates(Load, UI) || UI == Load || UI == Store) - return true; ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc); - if ((isModSet(DestModRef) && isRefSet(Res)) || - (isRefSet(DestModRef) && isModSet(Res))) - return false; + if (BBInstModRefMap.count(UI->getParent())) + BBInstModRefMap[UI->getParent()][UI].second = Res; + else + BBInstModRefMap[UI->getParent()].insert( + {UI, {ModRefInfo::NoModRef, Res}}); return true; }; @@ -1671,6 +1752,9 @@ if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) return false; + if (!checkModRefConflict(BBInstModRefMap, SrcAlloca, Store, DL)) + return false; + // We can do the transformation. First, align the allocas appropriately. SrcAlloca->setAlignment( std::max(SrcAlloca->getAlign(), DestAlloca->getAlign())); Index: llvm/test/Transforms/MemCpyOpt/stack-move.ll =================================================================== --- llvm/test/Transforms/MemCpyOpt/stack-move.ll +++ llvm/test/Transforms/MemCpyOpt/stack-move.ll @@ -599,7 +599,7 @@ ; CHECK-LABEL: define void @multi_bb_dom_test1 ; CHECK-SAME: (i1 [[B:%.*]]) { ; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 -; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: br i1 [[B]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 @@ -608,12 +608,11 @@ ; CHECK-NEXT: store [[STRUCT_FOO]] { i32 40, i32 50, i32 60 }, ptr [[SRC]], align 4 ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: ret void ; CHECK: unr: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) ; CHECK-NEXT: br label [[BB2]] ; %src = alloca %struct.Foo, align 4 @@ -876,17 +875,12 @@ define void @memcpy_is_def() { ; CHECK-LABEL: define void @memcpy_is_def() { ; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 -; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SRC]], ptr align 4 [[DEST]], i64 12, i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: ret void ; %src = alloca %struct.Foo, align 4 @@ -910,17 +904,13 @@ define void @memset_is_def() { ; CHECK-LABEL: define void @memset_is_def() { ; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 -; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[SRC]], i8 42, i64 12, i1 false) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: ret void ; %src = alloca %struct.Foo, align 4 @@ -944,18 +934,13 @@ define void @store_is_def() { ; CHECK-LABEL: define void @store_is_def() { ; CHECK-NEXT: [[SRC:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DEST:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[SRC]]) ; CHECK-NEXT: store i32 42, ptr [[SRC]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[DEST]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) ; CHECK-NEXT: store i32 64, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nocapture [[DEST]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[SRC]]) ; CHECK-NEXT: ret void ; %src = alloca i32, align 4 @@ -1514,22 +1499,18 @@ ; CHECK-LABEL: define void @multi_bb_mod_ref_conflict ; CHECK-SAME: (i1 [[B:%.*]]) { ; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 -; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false) ; CHECK-NEXT: br i1 [[B]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[DEST]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr nocapture noundef [[SRC]]) ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[SRC]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr nocapture [[DEST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[SRC]]) ; CHECK-NEXT: ret void ; %src = alloca %struct.Foo, align 4