Index: lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
===================================================================
--- lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -63,6 +63,12 @@
 RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
                            cl::init(6));
 
+// Should we try to find a better place to remat? This is believed to always
+// be a good idea (on x86 at least). The option is mostly useful as a
+// performance debugging aid in identifying any resulting regressions.
+static cl::opt<bool> UseRematSchedule("rs4gc-schedule-remat", cl::Hidden,
+                                      cl::init(true));
+
 #ifdef XDEBUG
 static bool ClobberNonLive = true;
 #else
@@ -2125,13 +2131,114 @@
   return Cost;
 }
 
+/// Find a basic block which is dominated by the definitions of all of the
+/// values used by the instructions in Chain. Since elements of the remat
+/// chain are assumed to be side effect free, this tells us where it is legal
+/// to insert the chain.
+static BasicBlock *findDefScope(DominatorTree &DT, BasicBlock *UseBlock,
+                                ArrayRef<Instruction *> Chain) {
+  SmallSet<BasicBlock *, 16> DefBlocks;
+  for (Instruction *Link : Chain)
+    for (Value *V : Link->operands())
+      if (auto *I = dyn_cast<Instruction>(V))
+        DefBlocks.insert(I->getParent());
+
+  BasicBlock *Current = UseBlock;
+  while (true) {
+    if (DefBlocks.count(Current))
+      return Current;
+    if (Current == &Current->getParent()->getEntryBlock())
+      return Current;
+    Current = DT.getNode(Current)->getIDom()->getBlock();
+  }
+  llvm_unreachable("use not dominated by defs?");
+}
+
+/// Find a good place to insert a rematerialization chain. The motivation for
+/// this is that the safepoint we're rematerializing for might be buried in a
+/// conditionally executed block or a loop nest. While having the
+/// rematerializations in a rarely executed block would seem ideal, in
+/// practice it's often better to materialize closer to the uses. In addition
+/// to the solid chance that we can fold the rematerialized address
+/// computation into the load (on x86 at least), if we materialize in the rare
+/// path and insert phis around the loop nest, we can greatly increase
+/// register pressure. The resulting materializations are likely to be spilled
+/// and rematerialized (again) in the fast path where actually needed, but
+/// this time without the knowledge of what's actually being done (due to
+/// those darn phis). It's generally far better to just compute the pointer
+/// via an LEA than to introduce a load from a stack slot. Note that the last
+/// part here only holds because a) we're not keeping pointers in registers
+/// when lowering in the backend, and b) the backend can't see back through a
+/// complicated nest of phis to realize an instruction could be rematerialized
+/// late.
+static Instruction *findInsertPoint(DominatorTree &DT, Value *Replacee,
+                                    ArrayRef<Instruction *> Chain,
+                                    Instruction *TrivialIP) {
+
+  // The heuristic here is to find the latest point we can which post-dominates
+  // the safepoint and comes before the first use. By pushing into successor
+  // blocks, we can greatly reduce the number of phis needed since we don't
+  // need to merge the remated value with anything from another path. This is
+  // specifically relying on the fact that we can remat the original value
+  // without side effects along any path, not merely the one which happened to
+  // go through the safepoint.
+
+  // TODO: as mentioned in the comment above, this is somewhat x86 specific in
+  // its assumptions.
+  // Using appropriate target hooks would be useful once we're trying to
+  // support other architectures.
+
+  // TODO: it might be a good idea to restrict how aggressive this is by using
+  // block frequency info or the size of the live set at the safepoint
+  // (i.e. minimum register pressure).
+
+  BasicBlock *Scope = findDefScope(DT, TrivialIP->getParent(), Chain);
+
+  auto hasNextInstruction = [&](Instruction *I) {
+    if (!I->isTerminator())
+      return true;
+    BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+    return nextBB && DT.dominates(Scope, nextBB);
+  };
+
+  auto nextInstruction = [&hasNextInstruction](Instruction *I) {
+    assert(hasNextInstruction(I) &&
+           "first check if there is a next instruction!");
+    if (I->isTerminator())
+      return &I->getParent()->getUniqueSuccessor()->front();
+    return &*++I->getIterator();
+  };
+
+  auto hasUseOf = [](User *U, Value *V) {
+    for (unsigned i = 0, E = U->getNumOperands(); i != E; ++i)
+      if (U->getOperand(i) == V)
+        return true;
+    return false;
+  };
+
+  Instruction *cursor = TrivialIP;
+  for (; hasNextInstruction(cursor);
+       cursor = nextInstruction(cursor)) {
+    // TODO: consider placing remats at each use, then tracking the dominating
+    // def until we leave the merge chain or the controlling scope.
+    if (hasUseOf(cursor, Replacee))
+      break;
+
+    // Can't insert new defs past another statepoint without adjusting its
+    // live range... but that's okay, this means we didn't find any uses which
+    // needed handling. That implies the remat here is trivially dead anyways.
+    // We'll insert it for simplicity, but InstSimplify will kill it.
+    if (isStatepoint(cursor))
+      break;
+  }
+  return cursor;
+}
+
 // From the statepoint live set pick values that are cheaper to recompute then
 // to relocate. Remove this values from the live set, rematerialize them after
 // statepoint and record them in "Info" structure. Note that similar to
 // relocated values we don't do any user adjustments here.
 static void rematerializeLiveValues(CallSite CS,
                                     PartiallyConstructedSafepointRecord &Info,
-                                    TargetTransformInfo &TTI) {
+                                    TargetTransformInfo &TTI,
+                                    DominatorTree &DT) {
   const unsigned int ChainLengthThreshold = 10;
 
   // Record values we are going to delete from this statepoint live set.
@@ -2179,7 +2286,7 @@
   // Utility function which clones all instructions from "ChainToBase"
   // and inserts them before "InsertBefore". Returns rematerialized value
   // which should be used after statepoint.
- auto rematerializeChain = [&ChainToBase](Instruction *InsertBefore) { + auto rematerializeChain = [&](Instruction *InsertBefore) { Instruction *LastClonedValue = nullptr; Instruction *LastValue = nullptr; for (Instruction *Instr: ChainToBase) { @@ -2221,6 +2328,8 @@ if (CS.isCall()) { Instruction *InsertBefore = CS.getInstruction()->getNextNode(); assert(InsertBefore); + if (UseRematSchedule) + InsertBefore = findInsertPoint(DT, LiveValue, ChainToBase, InsertBefore); Instruction *RematerializedValue = rematerializeChain(InsertBefore); Info.RematerializedValues[RematerializedValue] = LiveValue; } else { @@ -2228,8 +2337,14 @@ Instruction *NormalInsertBefore = &*Invoke->getNormalDest()->getFirstInsertionPt(); + if (UseRematSchedule) + NormalInsertBefore = findInsertPoint(DT, LiveValue, ChainToBase, + NormalInsertBefore); Instruction *UnwindInsertBefore = &*Invoke->getUnwindDest()->getFirstInsertionPt(); + if (UseRematSchedule) + UnwindInsertBefore = findInsertPoint(DT, LiveValue, ChainToBase, + UnwindInsertBefore); Instruction *NormalRematerializedValue = rematerializeChain(NormalInsertBefore); @@ -2400,7 +2515,7 @@ // some values instead of relocating them. This is purely an optimization and // does not influence correctness. for (size_t i = 0; i < Records.size(); i++) - rematerializeLiveValues(ToUpdate[i], Records[i], TTI); + rematerializeLiveValues(ToUpdate[i], Records[i], TTI, DT); // We need this to safely RAUW and delete call or invoke return values that // may themselves be live over a statepoint. For details, please see usage in Index: test/Transforms/RewriteStatepointsForGC/remat-schedule.ll =================================================================== --- test/Transforms/RewriteStatepointsForGC/remat-schedule.ll +++ test/Transforms/RewriteStatepointsForGC/remat-schedule.ll @@ -0,0 +1,119 @@ +; RUN: opt %s -rewrite-statepoints-for-gc -S -rs4gc-schedule-remat=1 -rs4gc-use-deopt-bundles 2>&1 | FileCheck %s + +declare void @use_obj16(i16 addrspace(1)*) "gc-leaf-function" +declare void @use_obj32(i32 addrspace(1)*) "gc-leaf-function" +declare void @use_obj64(i64 addrspace(1)*) "gc-leaf-function" +declare void @do_safepoint() + +define void @test(i32 addrspace(1)* %base) gc "statepoint-example" { +; CHECK-LABEL: @test +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %base, i32 15 + ; CHECK: getelementptr i32, i32 addrspace(1)* %base, i32 15 + call void @do_safepoint() ["deopt" ()] + ; CHECK: gc.relocate + ; CHECK: bitcast + ; CHECK: br label %next + br label %next +next: + ; CHECK: call + ; CHECK: getelementptr i32, i32 addrspace(1)* %base.relocated.casted, i32 15 + ; CHECK: call + call void @use_obj32(i32 addrspace(1)* %base) + call void @use_obj32(i32 addrspace(1)* %ptr) + ret void +} + +; Only need a PHI for the base pointer in this loop. 
+; The derived pointer can be nicely remated.
+define void @test2(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test2
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+  ; CHECK: getelementptr
+  br label %loop
+
+loop:
+  ; CHECK: phi i32 addrspace(1)* [ %base, %entry ], [ %base.relocated.casted, %loop ]
+  ; CHECK: %ptr.gep.remat = getelementptr
+  call void @use_obj32(i32 addrspace(1)* %ptr.gep)
+  call void @do_safepoint() ["deopt" ()]
+  ; CHECK: gc.relocate
+  br label %loop
+}
+
+@G = external global i32
+
+; Can sink the remat into the bottom of the loop, which lets it merge
+; with the load.
+define i8 @test3(i8 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test3
+entry:
+  %gep = getelementptr i8, i8 addrspace(1)* %base, i64 8
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %merge]
+  %iv.next = add i32 %iv, 1
+  %test = load volatile i32, i32* @G
+  %sp_flag = icmp eq i32 %test, 0
+  br i1 %sp_flag, label %safepoint, label %merge
+
+safepoint:
+  call void @do_safepoint() [ "deopt" () ]
+  br label %merge
+
+merge:
+; CHECK-LABEL: merge:
+; CHECK: phi i8 addrspace(1)*
+; CHECK: %gep.remat = getelementptr
+  %cnd = icmp ne i32 10000, %iv
+  br i1 %cnd, label %loop, label %exit
+
+exit:
+; CHECK-LABEL: exit:
+; CHECK: load i8, i8 addrspace(1)* %gep.remat
+  %load = load i8, i8 addrspace(1)* %gep
+  ret i8 %load
+}
+
+; In this case, we can't directly push the remat around the loop, but since we
+; got it to the end of the loop (i.e. no phi for merge), instcombine has no
+; trouble finishing the job for us.
+define i8 @test4(i8 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test4
+entry:
+  %gep = getelementptr i8, i8 addrspace(1)* %base, i64 8
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: phi i8 addrspace(1)* [ %gep, %entry ], [ %gep.remat, %merge ]
+; CHECK: load i8, i8 addrspace(1)*
+  %iv = phi i32 [0, %entry], [%iv.next, %merge]
+  %iv.next = add i32 %iv, 1
+  %load = load i8, i8 addrspace(1)* %gep
+  %test = load volatile i32, i32* @G
+  %sp_flag = icmp eq i32 %test, 0
+  br i1 %sp_flag, label %safepoint, label %merge
+
+safepoint:
+  call void @do_safepoint() [ "deopt" () ]
+  br label %merge
+
+merge:
+; CHECK-LABEL: merge:
+; CHECK: phi i8 addrspace(1)*
+; CHECK: %gep.remat = getelementptr
+  %cnd = icmp ne i32 10000, %iv
+  br i1 %cnd, label %loop, label %exit
+
+exit:
+; CHECK-LABEL: exit:
+  ret i8 %load
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
Index: test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll
===================================================================
--- test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll
+++ test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -rewrite-statepoints-for-gc -S 2>&1 | FileCheck %s
+; RUN: opt %s -rewrite-statepoints-for-gc -S -rs4gc-schedule-remat=0 2>&1 | FileCheck %s
 declare void @use_obj16(i16 addrspace(1)*)
 declare void @use_obj32(i32 addrspace(1)*)