diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -109,6 +109,9 @@ AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", cl::Hidden, cl::init(true)); +static cl::opt RematDerivedAtUses("rs4gc-remat-derived-at-uses", + cl::Hidden, cl::init(true)); + /// The IR fed into RewriteStatepointsForGC may have had attributes and /// metadata implying dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after @@ -2434,6 +2437,126 @@ } } +// Try to rematerialize derived pointers immediately before their uses +// (instead of rematerializing after every statepoint it is live through). +// This can be beneficial when derived pointer is live across many +// statepoints, but uses are rare. +static void rematerializeLiveValuesAtUses( + RematCandTy &RematerizationCandidates, + MutableArrayRef Records, + PointerToBaseTy &PointerToBase) { + if (!RematDerivedAtUses) + return; + + SmallVector LiveValuesToBeDeleted; + + LLVM_DEBUG(dbgs() << "Rematerialize derived pointers at uses, " + << "Num statepoints: " << Records.size() << '\n'); + + for (auto &It : RematerizationCandidates) { + Instruction *Cand = cast(It.first); + auto &Record = It.second; + + if (Record.Cost >= RematerializationThreshold) + continue; + + if (Cand->user_empty()) + continue; + + if (Cand->hasOneUse()) + if (auto *U = dyn_cast(Cand->getUniqueUndroppableUser())) + if (U->getParent() == Cand->getParent()) + continue; + + // Rematerialization before PHI nodes is not implemented. + if (llvm::any_of(Cand->users(), + [](const auto *U) { return isa(U); })) + continue; + + LLVM_DEBUG(dbgs() << "Trying cand " << *Cand << " ... "); + + // Count of rematerialization instructions we introduce is equal to number + // of candidate uses. + // Count of rematerialization instructions we eliminate is equal to number + // of statepoints it is live through. + // Consider transformation profitable if latter is greater than former + // (in other words, we create less than eliminate). + unsigned NumLiveStatepoints = llvm::count_if( + Records, [Cand](const auto &R) { return R.LiveSet.contains(Cand); }); + unsigned NumUses = Cand->getNumUses(); + + LLVM_DEBUG(dbgs() << "Num uses: " << NumUses << " Num live statepoints: " + << NumLiveStatepoints << " "); + + if (NumLiveStatepoints < NumUses) { + LLVM_DEBUG(dbgs() << "not profitable\n"); + continue; + } + + // If rematerialization is 'free', then favor rematerialization at + // uses as it generally shortens live ranges. + // TODO: Short (size ==1) chains only? + if (NumLiveStatepoints == NumUses && Record.Cost > 0) { + LLVM_DEBUG(dbgs() << "not profitable\n"); + continue; + } + + LLVM_DEBUG(dbgs() << "looks profitable\n"); + + // ChainToBase may contain another remat candidate (as a sub chain) which + // has been rewritten by now. Need to recollect chain to have up to date + // value. + // TODO: sort records in findRematerializationCandidates() in + // decreasing chain size order? + if (Record.ChainToBase.size() > 1) { + Record.ChainToBase.clear(); + findRematerializableChainToBasePointer(Record.ChainToBase, Cand); + } + + // Current rematerialization algorithm is very simple: we rematerialize + // immediately before EVERY use, even if there are several uses in same + // block or if use is local to Cand Def. The reason is that this allows + // us to avoid recomputing liveness without complicated analysis: + // - If we did not eliminate all uses of original Candidate, we do not + // know exaclty in what BBs it is still live. + // - If we rematerialize once per BB, we need to find proper insertion + // place (first use in block, but after Def) and analyze if there is + // statepoint between uses in the block. + while (!Cand->user_empty()) { + Instruction *UserI = cast(*Cand->user_begin()); + Instruction *RematChain = rematerializeChain( + Record.ChainToBase, UserI, Record.RootOfChain, PointerToBase[Cand]); + UserI->replaceUsesOfWith(Cand, RematChain); + PointerToBase[RematChain] = PointerToBase[Cand]; + } + LiveValuesToBeDeleted.push_back(Cand); + } + + LLVM_DEBUG(dbgs() << "Rematerialized " << LiveValuesToBeDeleted.size() + << " derived pointers\n"); + for (auto *Cand : LiveValuesToBeDeleted) { + assert(Cand->use_empty() && "Unexpected user remain"); + RematerizationCandidates.erase(Cand); + for (auto &R : Records) { + assert(!R.LiveSet.contains(Cand) || + R.LiveSet.contains(PointerToBase[Cand])); + R.LiveSet.remove(Cand); + } + } + + // Recollect not rematerialized chains - we might have rewritten + // their sub-chains. + if (!LiveValuesToBeDeleted.empty()) { + for (auto &P : RematerizationCandidates) { + auto &R = P.second; + if (R.ChainToBase.size() > 1) { + R.ChainToBase.clear(); + findRematerializableChainToBasePointer(R.ChainToBase, P.first); + } + } + } +} + // From the statepoint live set pick values that are cheaper to recompute then // to relocate. Remove this values from the live set, rematerialize them after // statepoint and record them in "Info" structure. Note that similar to @@ -2699,6 +2822,9 @@ // In order to reduce live set of statepoint we might choose to rematerialize // some values instead of relocating them. This is purely an optimization and // does not influence correctness. + // First try rematerialization at uses, then after statepoints. + rematerializeLiveValuesAtUses(RematerizationCandidates, Records, + PointerToBase); for (size_t i = 0; i < Records.size(); i++) rematerializeLiveValues(ToUpdate[i], Records[i], PointerToBase, RematerizationCandidates, TTI); diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers-at-uses.ll b/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers-at-uses.ll --- a/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers-at-uses.ll +++ b/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers-at-uses.ll @@ -12,21 +12,19 @@ ; CHECK-NEXT: [[DERIVED:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE:%.*]], i32 16 ; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE]]) ] ; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0) -; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED]], i32 16 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[HERE:%.*]], label [[THERE:%.*]] ; CHECK: here: -; CHECK-NEXT: [[STATEPOINT_TOKEN3:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] -; CHECK-NEXT: [[BASE_RELOCATED4:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN3]], i32 0, i32 0) -; CHECK-NEXT: [[DERIVED_REMAT1:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED4]], i32 16 +; CHECK-NEXT: [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] +; CHECK-NEXT: [[BASE_RELOCATED2:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN1]], i32 0, i32 0) ; CHECK-NEXT: br label [[MERGE:%.*]] ; CHECK: there: -; CHECK-NEXT: [[STATEPOINT_TOKEN5:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] -; CHECK-NEXT: [[BASE_RELOCATED6:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN5]], i32 0, i32 0) -; CHECK-NEXT: [[DERIVED_REMAT2:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED6]], i32 16 +; CHECK-NEXT: [[STATEPOINT_TOKEN3:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] +; CHECK-NEXT: [[BASE_RELOCATED4:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN3]], i32 0, i32 0) ; CHECK-NEXT: br label [[MERGE]] ; CHECK: merge: -; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[DERIVED_REMAT1]], [[HERE]] ], [ [[DERIVED_REMAT2]], [[THERE]] ] -; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DOT0]], align 4 +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[BASE_RELOCATED2]], [[HERE]] ], [ [[BASE_RELOCATED4]], [[THERE]] ] +; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[DOT0]], i32 16 +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DERIVED_REMAT]], align 4 ; CHECK-NEXT: ret i32 [[RET]] ; entry: @@ -141,27 +139,27 @@ ; CHECK-NEXT: [[DERIVED:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE:%.*]], i32 16 ; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE]]) ] ; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0) -; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED]], i32 16 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[HERE:%.*]], label [[THERE:%.*]] ; CHECK: here: -; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT]]) +; CHECK-NEXT: [[DERIVED_REMAT3:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED]], i32 16 +; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT3]]) ; CHECK-NEXT: [[STATEPOINT_TOKEN4:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] ; CHECK-NEXT: [[BASE_RELOCATED5:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN4]], i32 0, i32 0) +; CHECK-NEXT: [[DERIVED_REMAT2:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED5]], i32 16 +; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT2]]) ; CHECK-NEXT: [[DERIVED_REMAT1:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED5]], i32 16 -; CHECK-NEXT: call void @use_obj(ptr addrspace(1) [[DERIVED_REMAT1]]) ; CHECK-NEXT: [[DUMMY:%.*]] = load i32, ptr addrspace(1) [[DERIVED_REMAT1]], align 4 ; CHECK-NEXT: [[STATEPOINT_TOKEN6:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED5]]) ] ; CHECK-NEXT: [[BASE_RELOCATED7:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN6]], i32 0, i32 0) -; CHECK-NEXT: [[DERIVED_REMAT2:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED7]], i32 16 ; CHECK-NEXT: br label [[MERGE:%.*]] ; CHECK: there: ; CHECK-NEXT: [[STATEPOINT_TOKEN8:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] ; CHECK-NEXT: [[BASE_RELOCATED9:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN8]], i32 0, i32 0) -; CHECK-NEXT: [[DERIVED_REMAT3:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE_RELOCATED9]], i32 16 ; CHECK-NEXT: br label [[MERGE]] ; CHECK: merge: -; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[DERIVED_REMAT2]], [[HERE]] ], [ [[DERIVED_REMAT3]], [[THERE]] ] -; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DOT0]], align 4 +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr addrspace(1) [ [[BASE_RELOCATED7]], [[HERE]] ], [ [[BASE_RELOCATED9]], [[THERE]] ] +; CHECK-NEXT: [[DERIVED_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[DOT0]], i32 16 +; CHECK-NEXT: [[RET:%.*]] = load i32, ptr addrspace(1) [[DERIVED_REMAT]], align 4 ; CHECK-NEXT: ret i32 [[RET]] ; entry: @@ -194,14 +192,14 @@ ; CHECK-NEXT: [[V0:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE:%.*]], i64 16 ; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(ptr addrspace(1) [[BASE]]), "gc-live"(ptr addrspace(1) [[BASE]]) ] ; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0) -; CHECK-NEXT: [[V0_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED]], i64 16 ; CHECK-NEXT: br i1 [[COND1:%.*]], label [[BLOCK3:%.*]], label [[COMMON_RET:%.*]] ; CHECK: block3: +; CHECK-NEXT: [[V0_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED]], i64 16 ; CHECK-NEXT: [[V4:%.*]] = getelementptr i8, ptr addrspace(1) [[V0_REMAT]], i64 70 -; CHECK-NEXT: [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(ptr addrspace(1) [[BASE_RELOCATED]]), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] -; CHECK-NEXT: [[BASE_RELOCATED3:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN2]], i32 0, i32 0) -; CHECK-NEXT: [[V0_REMAT1:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED3]], i64 16 -; CHECK-NEXT: [[V4_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[V0_REMAT1]], i64 70 +; CHECK-NEXT: [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(ptr addrspace(1) [[BASE_RELOCATED]]), "gc-live"(ptr addrspace(1) [[BASE_RELOCATED]]) ] +; CHECK-NEXT: [[BASE_RELOCATED2:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN1]], i32 0, i32 0) +; CHECK-NEXT: [[V0_REMAT_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE_RELOCATED2]], i64 16 +; CHECK-NEXT: [[V4_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[V0_REMAT_REMAT]], i64 70 ; CHECK-NEXT: [[V5:%.*]] = load atomic i8, ptr addrspace(1) [[V4_REMAT]] unordered, align 2 ; CHECK-NEXT: br label [[COMMON_RET]] ; CHECK: common.ret: diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll b/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll --- a/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll +++ b/llvm/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll @@ -298,13 +298,12 @@ ; CHECK-NEXT: [[PTR_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[BASE:%.*]], i32 15 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[DOT01:%.*]] = phi i32 addrspace(1)* [ [[PTR_GEP]], [[ENTRY:%.*]] ], [ [[PTR_GEP_REMAT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DOT0:%.*]] = phi i32 addrspace(1)* [ [[BASE]], [[ENTRY]] ], [ [[BASE_RELOCATED_CASTED:%.*]], [[LOOP]] ] -; CHECK-NEXT: call void @use_obj32(i32 addrspace(1)* [[DOT01]]) +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 addrspace(1)* [ [[BASE]], [[ENTRY:%.*]] ], [ [[BASE_RELOCATED_CASTED:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_GEP_REMAT:%.*]] = getelementptr i32, i32 addrspace(1)* [[DOT0]], i32 15 +; CHECK-NEXT: call void @use_obj32(i32 addrspace(1)* [[PTR_GEP_REMAT]]) ; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "deopt"(), "gc-live"(i32 addrspace(1)* [[DOT0]]) ] ; CHECK-NEXT: [[BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 0, i32 0) ; CHECK-NEXT: [[BASE_RELOCATED_CASTED]] = bitcast i8 addrspace(1)* [[BASE_RELOCATED]] to i32 addrspace(1)* -; CHECK-NEXT: [[PTR_GEP_REMAT]] = getelementptr i32, i32 addrspace(1)* [[BASE_RELOCATED_CASTED]], i32 15 ; CHECK-NEXT: br label [[LOOP]] ; entry: