Index: llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -603,7 +603,7 @@ // Clear state StatepointLowering.startNewStatepoint(*this); assert(SI.Bases.size() == SI.Ptrs.size() && - SI.Ptrs.size() == SI.GCRelocates.size()); + SI.Ptrs.size() <= SI.GCRelocates.size()); #ifndef NDEBUG for (auto *Reloc : SI.GCRelocates) @@ -823,10 +823,29 @@ ISP.getNumCallArgs(), ActualCallee, ISP.getActualReturnType(), false /* IsPatchPoint */); + // There may be duplication in the gc.relocate list; such as two copies of + // each relocation on normal and exceptional path for an invoke. We only + // need to spill once and record one copy in the stackmap, but we need to + // reload once per gc.relocate. (Dedupping gc.relocates is trickier and best + // handled as a CSE problem elsewhere.) + // TODO: There are a couple of major stackmap size optimizations we could do + // here if we wished. + // 1) If we've encountered a derived pair {B, D}, we don't need to actually + // record {B,B} if it's seen later. + // 2) Due to rematerialization, actual derived pointers are somewhat rare; + // given that, we could change the format to record base pointer relocations + // separately with half the space. This would require a format rev and a + // fairly major rework of the STATEPOINT node though. 
+ SmallSet Seen; for (const GCRelocateInst *Relocate : ISP.getRelocates()) { SI.GCRelocates.push_back(Relocate); - SI.Bases.push_back(Relocate->getBasePtr()); - SI.Ptrs.push_back(Relocate->getDerivedPtr()); + + SDValue BaseSD = getValue(Relocate->getBasePtr()); + SDValue DerivedSD = getValue(Relocate->getDerivedPtr()); + if (Seen.insert(DerivedSD).second) { + SI.Bases.push_back(Relocate->getBasePtr()); + SI.Ptrs.push_back(Relocate->getDerivedPtr()); + } } SI.GCArgs = ArrayRef(ISP.gc_args_begin(), ISP.gc_args_end()); @@ -973,10 +992,13 @@ unsigned Index = *DerivedPtrLocation; SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy()); - // Note: We know all of these reloads are independent, but don't bother to - // exploit that chain wise. DAGCombine will happily do so as needed, so - // doing it here would be a small compile time win at most. - SDValue Chain = getRoot(); + // All the reloads are independent and are reading memory only modified by + // statepoints (i.e. no other aliasing stores); informing SelectionDAG of + // this lets CSE kick in for free and allows reordering of instructions + // if possible. The lowering for statepoint sets the root, so this is + // ordering all reloads with either a) the statepoint node itself, or b) + // the entry of the current block for an invoke statepoint. 
+ const SDValue Chain = DAG.getRoot(); // != Builder.getRoot() auto &MF = DAG.getMachineFunction(); auto &MFI = MF.getFrameInfo(); @@ -991,8 +1013,7 @@ SDValue SpillLoad = DAG.getLoad(LoadVT, getCurSDLoc(), Chain, SpillSlot, LoadMMO); - - DAG.setRoot(SpillLoad.getValue(1)); + PendingLoads.push_back(SpillLoad.getValue(1)); assert(SpillLoad.getNode()); setValue(&Relocate, SpillLoad); Index: llvm/test/CodeGen/X86/statepoint-duplicates-export.ll =================================================================== --- llvm/test/CodeGen/X86/statepoint-duplicates-export.ll +++ llvm/test/CodeGen/X86/statepoint-duplicates-export.ll @@ -44,19 +44,16 @@ define i1 @test2(i32 addrspace(1)* %arg) gc "statepoint-example" { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movq %rdi, (%rsp) ; CHECK-NEXT: callq func ; CHECK-NEXT: .Ltmp2: -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: callq func ; CHECK-NEXT: .Ltmp3: -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: orq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: cmpq $0, (%rsp) ; CHECK-NEXT: sete %al -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: Index: llvm/test/CodeGen/X86/statepoint-stackmap-size.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/statepoint-stackmap-size.ll @@ -0,0 +1,22 @@ +; RUN: llc -verify-machineinstrs < %s | fgrep -A 10000 .llvm_stackmaps | wc -l | FileCheck %s + +; Without removal of duplicate entries, the size is 62 lines +; CHECK: 50 + +target triple = "x86_64-pc-linux-gnu" + +declare void @func() + +define i1 @test1(i32 addrspace(1)* %arg) gc "statepoint-example" { +entry: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, 
...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %arg) + %reloc1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7) + %reloc2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7) + %cmp1 = icmp eq i32 addrspace(1)* %reloc1, null + %cmp2 = icmp eq i32 addrspace(1)* %reloc2, null + %cmp = and i1 %cmp1, %cmp2 + ret i1 %cmp +} + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) +declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)