Index: lib/CodeGen/SelectionDAG/StatepointLowering.h =================================================================== --- lib/CodeGen/SelectionDAG/StatepointLowering.h +++ lib/CodeGen/SelectionDAG/StatepointLowering.h @@ -43,8 +43,8 @@ /// Returns the spill location of a value incoming to the current /// statepoint. Will return SDValue() if this value hasn't been - /// spilled. Otherwise, the value has already been spilled and no - /// further action is required by the caller. + /// assigned a spill slot. Note that the value may have been assigned a + /// spill slot, but not yet actually spilled. SDValue getLocation(SDValue val) { if (!Locations.count(val)) return SDValue(); Index: lib/CodeGen/SelectionDAG/StatepointLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -114,8 +114,12 @@ } /// Utility function for reservePreviousStackSlotForValue. Tries to find -/// stack slot index to which we have spilled value for previous statepoints. -/// LookUpDepth specifies maximum DFS depth this function is allowed to look. +/// stack slot index which is obviously profitable to use for this value. +/// Looks for values which spills of the same value, values generated from +/// reloads, and simple updates to previously spilled values. LookUpDepth +/// specifies maximum DFS depth this function is allowed to look. This function +/// doesn't worry about conflicting spill slots, that's handled by it's +/// caller. static Optional findPreviousSpillSlot(const Value *Val, SelectionDAGBuilder &Builder, int LookUpDepth) { @@ -137,11 +141,38 @@ return It->second; } - // Look through bitcast instructions. + // Look through bitcast instructions. We can do this since bitcast don't + // change the value in the slot and we can store both the original and + // bitcasted value in the same slot. if (const BitCastInst *Cast = dyn_cast(Val)) { return findPreviousSpillSlot(Cast->getOperand(0), Builder, LookUpDepth - 1); } + // We can place a new value in an existing stack slot if we can be confident + // that the original value won't be live over the statepoint and thus + // preferrable to put in that slot. Note that hasOneUse check is an + // optimization heuristic, not a correctness check. Conflicting reservations + // are handled explicitly in our caller. Single use updates have a + // particularly nice property on x86 in that many of them can be folded to + // updates against memory if the same spill slot can be used on both sides. + // This really helps cases like: + // statepoint @foo() (p) + // p2 = gep p+1 + // statepoint @bar() (p2) <-- where p is dead here + // Note that the spill slot reuse works for deopt state as well, but we don't + // get the nice update in place property since the data dependency in on p, + // not reload(spill(p)) and the register allocator doesn't know that the + // spill slot contains the value of (p). This is probably fixable. + if (Val->hasOneUse()) { + if (auto *GEP = dyn_cast(Val)) + return findPreviousSpillSlot(GEP->getPointerOperand(), Builder, + LookUpDepth - 1); + if (auto *BOp = dyn_cast(Val)) + if (isa(BOp->getOperand(1))) + return findPreviousSpillSlot(BOp->getOperand(0), Builder, + LookUpDepth - 1); + } + // Look through phi nodes // All incoming values should have same known stack slot, otherwise result // is unknown. @@ -173,22 +204,6 @@ // in example we still need to emit store, but instead of any location // we need to use special "preferred" location. - // TODO: handle simple updates. If a value is modified and the original - // value is no longer live, it would be nice to put the modified value in the - // same slot. This allows folding of the memory accesses for some - // instructions types (like an increment). - // statepoint (i) - // i1 = i+1 - // statepoint (i1) - // However we need to be careful for cases like this: - // statepoint(i) - // i1 = i+1 - // statepoint(i, i1) - // Here we want to reserve spill slot for 'i', but not for 'i+1'. If we just - // put handling of simple modifications in this function like it's done - // for bitcasts we might end up reserving i's slot for 'i+1' because order in - // which we visit values is unspecified. - // Don't know any information about this instruction return Optional(); } @@ -197,7 +212,9 @@ /// statepoint spilling. If we can find a spill slot for the incoming value, /// mark that slot as allocated, and reuse the same slot for this safepoint. /// This helps to avoid series of loads and stores that only serve to resuffle -/// values on the stack between calls. +/// values on the stack between calls. We may also be able to reuse an +/// existing slot if we can trivially tell the original value isn't live at +/// this statepoint but a value computed from it is. static void reservePreviousStackSlotForValue(const Value *IncomingValue, SelectionDAGBuilder &Builder) { @@ -230,11 +247,6 @@ std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr); if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) { // stack slot already assigned to someone else, can't use it! - // TODO: currently we reserve space for gc arguments after doing - // normal allocation for deopt arguments. We should reserve for - // _all_ deopt and gc arguments, then start allocating. This - // will prevent some moves being inserted when vm state changes, - // but gc state doesn't between two calls. return; } // Reserve this stack slot @@ -399,25 +411,29 @@ SelectionDAGBuilder &Builder) { SDValue Loc = Builder.StatepointLowering.getLocation(Incoming); - // Emit new store if we didn't do it for this ptr before + // If we couldn't find a profitable stack slot previously, assign one. if (!Loc.getNode()) { Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(), Builder); assert(isa(Loc)); - int Index = cast(Loc)->getIndex(); - // We use TargetFrameIndex so that isel will not select it into LEA - Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType()); - - // TODO: We can create TokenFactor node instead of - // chaining stores one after another, this may allow - // a bit more optimal scheduling for them - Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc, - MachinePointerInfo::getFixedStack(Index), - false, false, 0); - Builder.StatepointLowering.setLocation(Incoming, Loc); } + // Now spill the value. When we have multiple values which map the same + // stack slot this may result in redundant spills, but we can generally rely + // on the backend to clean these up for us and don't need to be fancy here. + + int Index = cast(Loc)->getIndex(); + // We use TargetFrameIndex so that isel will not select it into LEA + Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType()); + + // TODO: We can create TokenFactor node instead of + // chaining stores one after another, this may allow + // a bit more optimal scheduling for them + Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc, + MachinePointerInfo::getFixedStack(Index), + false, false, 0); + assert(Loc.getNode()); return std::make_pair(Loc, Chain); } Index: test/CodeGen/X86/statepoint-stack-usage.ll =================================================================== --- test/CodeGen/X86/statepoint-stack-usage.ll +++ test/CodeGen/X86/statepoint-stack-usage.ll @@ -8,7 +8,7 @@ ; of GC arguments differ, niave lowering code would insert loads and ; stores to rearrange items on the stack. We need to make sure (for ; performance) that this doesn't happen. -define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { +define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" { ; CHECK-LABEL: back_to_back_calls ; The exact stores don't matter, but there need to be three stack slots created ; CHECK: movq %rdi, 16(%rsp) @@ -31,7 +31,7 @@ ; This test simply checks that minor changes in vm state don't prevent slots ; being reused for gc values. -define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { +define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" { ; CHECK-LABEL: reserve_first ; The exact stores don't matter, but there need to be three stack slots created ; CHECK: movq %rdi, 16(%rsp) @@ -53,7 +53,7 @@ } ; Test that stack slots are reused for invokes -define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { +define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" { ; CHECK-LABEL: back_to_back_invokes entry: ; The exact stores don't matter, but there need to be three stack slots created @@ -93,12 +93,89 @@ ret i32 0 } +; A simple forwarding case where the same value is used across two +; safepoints. We'll build on this pattern for other tests. +define i32 addrspace(1)* @test3(i32 addrspace(1)* %a) gc "statepoint-example" { +; CHECK-LABEL: test3 +; CHECK: movq %rdi, (%rsp) +; CHECK-NEXT: callq +; CHECK-NOT: movq +; CHECK: callq +; CHECK: movq (%rsp), %rax + + %safepoint_token = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a) + %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7) + %safepoint_token2 = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a1) + %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 7, i32 7) + ret i32 addrspace(1)* %a2 +} + +; A bitcast does not change the value stored, we can record the same stack slot +; for both a bit cast and it's source value. We'll end up with one stack slot, +; but two locations in the stackmap. +define i32 addrspace(1)* @test4(i32 addrspace(1)* %a) gc "statepoint-example" { +; CHECK-LABEL: test4 +; CHECK: movq %rdi, (%rsp) +; CHECK-NEXT: callq +; CHECK-NOT: movq +; CHECK: callq +; CHECK: movq (%rsp), %rax + + %safepoint_token = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a) + %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7) + %a1_cast = bitcast i32 addrspace(1)* %a1 to i64 addrspace(1)* + %safepoint_token2 = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a1, i64 addrspace(1)* %a1_cast) + %unused = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 7, i32 7) + %a2 = tail call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32 %safepoint_token2, i32 8, i32 8) + %a2_cast = bitcast i64 addrspace(1)* %a2 to i32 addrspace(1)* + ret i32 addrspace(1)* %a2_cast +} + +; For certain RMW idioms between statepoints, we can reuse the original stack +; slot and update the memory slot directly. This is obviously profitable if +; the original value is only live for the update. +define i32 addrspace(1)* @test5(i32 addrspace(1)* %a) gc "statepoint-example" { +; CHECK-LABEL: test5 +; CHECK: movq %rdi, (%rsp) +; CHECK-NEXT: callq +; CHECK-NOT: movq +; CHECK: addq $32, (%rsp) +; CHECK-NEXT: callq +; CHECK: movq (%rsp), %rax + + %safepoint_token = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a) + %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7) + %a1_derived = getelementptr i32, i32 addrspace(1)* %a1, i64 8 + %safepoint_token2 = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a1_derived) + %relocated = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 7, i32 7) + ret i32 addrspace(1)* %relocated +} + +; Same as test5, but with an integer add operation on deopt state rather than +; a GEP on a GC pointer. Note that we don't get update in place at the moment. +define void @test6(i32 %a) gc "statepoint-example" { +; CHECK-LABEL: test6 +; CHECK: movl %edi, %ebx +; CHECK-NEXT: movl %ebx, 12(%rsp) +; CHECK-NEXT: callq +; CHECK-NOT: movq +; CHECK: incl %ebx +; CHECK-NEXT: movl %ebx, 12(%rsp) +; CHECK-NEXT: callq + + call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 1, i32 %a) + %a1_add = add i32 %a, 1 + tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 1, i32 %a1_add) + ret void +} + + ; Function Attrs: nounwind declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3 +declare i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32, i32, i32) #3 declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32, i32, i32) #3 declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) declare i32 @"personality_function"() -attributes #1 = { uwtable }