Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -1324,6 +1324,7 @@
   if (!MI->getOperand(0).getSubReg() &&
       TargetRegisterInfo::isVirtualRegister(Reg) &&
       MRI->hasOneNonDBGUse(Reg)) {
+    //dbgs() << Reg << " might be foldable\n";
     FoldAsLoadDefCandidates.insert(Reg);
     return true;
   }
@@ -1540,11 +1541,6 @@
       if (MI->isDebugValue())
         continue;
 
-      // If we run into an instruction we can't fold across, discard
-      // the load candidates.
-      if (MI->isLoadFoldBarrier())
-        FoldAsLoadDefCandidates.clear();
-
      if (MI->isPosition() || MI->isPHI())
        continue;
 
@@ -1588,7 +1584,8 @@
         DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to "
                      << *MI << '\n');
         NAPhysToVirtMIs.clear();
-        continue;
+        // FOR DISCUSSION - Is it legal to delete this continue?
+        // continue;
       }
 
       if ((isUncoalescableCopy(*MI) &&
@@ -1639,41 +1636,69 @@
       // earlier load into MI.
       if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
           !FoldAsLoadDefCandidates.empty()) {
-        const MCInstrDesc &MIDesc = MI->getDesc();
-        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
-             ++i) {
-          const MachineOperand &MOp = MI->getOperand(i);
-          if (!MOp.isReg())
-            continue;
-          unsigned FoldAsLoadDefReg = MOp.getReg();
-          if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
-            // We need to fold load after optimizeCmpInstr, since
-            // optimizeCmpInstr can enable folding by converting SUB to CMP.
-            // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
-            // we need it for markUsesInDebugValueAsUndef().
-            unsigned FoldedReg = FoldAsLoadDefReg;
-            MachineInstr *DefMI = nullptr;
-            if (MachineInstr *FoldMI =
-                    TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
-              // Update LocalMIs since we replaced MI with FoldMI and deleted
-              // DefMI.
-              DEBUG(dbgs() << "Replacing: " << *MI);
-              DEBUG(dbgs() << "     With: " << *FoldMI);
-              LocalMIs.erase(MI);
-              LocalMIs.erase(DefMI);
-              LocalMIs.insert(FoldMI);
-              MI->eraseFromParent();
-              DefMI->eraseFromParent();
-              MRI->markUsesInDebugValueAsUndef(FoldedReg);
-              FoldAsLoadDefCandidates.erase(FoldedReg);
-              ++NumLoadFold;
-              // MI is replaced with FoldMI.
-              Changed = true;
-              break;
+
+        auto tryOptimizeLoad = [&]() {
+          const MCInstrDesc &MIDesc = MI->getDesc();
+          for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
+               ++i) {
+            //dbgs() << i << "\n";
+            const MachineOperand &MOp = MI->getOperand(i);
+            //dbgs() << MOp << "\n";
+            if (!MOp.isReg())
+              continue;
+            unsigned FoldAsLoadDefReg = MOp.getReg();
+            //dbgs() << "Candidate: " << FoldAsLoadDefReg << "\n";
+            if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
+              //dbgs() << "Found candidate to fold\n";
+              // We need to fold load after optimizeCmpInstr, since
+              // optimizeCmpInstr can enable folding by converting SUB to CMP.
+              // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
+              // we need it for markUsesInDebugValueAsUndef().
+              unsigned FoldedReg = FoldAsLoadDefReg;
+              MachineInstr *DefMI = nullptr;
+              if (MachineInstr *FoldMI =
+                      TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
+                // Update LocalMIs since we replaced MI with FoldMI and deleted
+                // DefMI.
+ DEBUG(dbgs() << "Replacing: " << *MI); + DEBUG(dbgs() << " With: " << *FoldMI); + LocalMIs.erase(MI); + LocalMIs.erase(DefMI); + LocalMIs.insert(FoldMI); + MI->eraseFromParent(); + DefMI->eraseFromParent(); + MRI->markUsesInDebugValueAsUndef(FoldedReg); + FoldAsLoadDefCandidates.erase(FoldedReg); + ++NumLoadFold; + return FoldMI; + } } } + return (MachineInstr*)nullptr; + }; + + // Keep trying to optimize the load until we can't fold any further. + // FOR DISCUSSION + // Note: This is O(n^2) in the number of operands if + // TII->optimizeLoadInstr were to fail for every other operand. + while(true) { + auto *FoldMI = tryOptimizeLoad(); + if (!FoldMI) + break; // No folding occurred + + // MI is replaced with FoldMI. + Changed = true; + MI = FoldMI; } } + // If we run into an instruction we can't fold across, discard + // the load candidates. Note: We might be able to fold *into* this + // instruction, so this needs to be after the folding logic. + if (MI->isLoadFoldBarrier()) { + DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n"); + FoldAsLoadDefCandidates.clear(); + } + } } Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -5331,13 +5331,21 @@ const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { + // FOR DISCUSSION - Is this left over from when we had a single variable as + // opposed to a set of FoldAsLoadDefs? Can we remove? if (FoldAsLoadDefReg == 0) return nullptr; + + // FOR DISCUSSION - Do we need this code at all? If so, why? Shouldn't + // foldMemoryOperand be robust against this case? +#if 0 // To be conservative, if there exists another load, clear the load candidate. - if (MI.mayLoad()) { + if (MI.mayLoad() && + MI.getOpcode() != TargetOpcode::STATEPOINT) { FoldAsLoadDefReg = 0; return nullptr; } +#endif // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); @@ -5347,27 +5355,24 @@ return nullptr; // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { + SmallVector SrcOperandIds; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; - // Do not fold if we have a subreg use or a def or multiple uses. - if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) + // Do not fold if we have a subreg use or a def. + if (MO.getSubReg() || MO.isDef()) return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; + SrcOperandIds.push_back(i); } - if (!FoundSrcOperand) + if (SrcOperandIds.empty()) return nullptr; // Check whether we can fold the def into SrcOperandId. - if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } Index: test/CodeGen/X86/statepoint-live-in.ll =================================================================== --- test/CodeGen/X86/statepoint-live-in.ll +++ test/CodeGen/X86/statepoint-live-in.ll @@ -34,35 +34,24 @@ entry: ; TODO: We should have folded the reload into the statepoint. 
 ; CHECK-LABEL: @test3
-; CHECK: movl 32(%rsp), %r10d
-; CHECK-NEXT: movl 24(%rsp), %r11d
-; CHECK-NEXT: movl 16(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Ltmp9:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
   ret void
 }
 
 ; This case just confirms that we don't crash when given more live values
-; than registers. This is a case where we *have* to use a stack slot.
+; than registers. This is a case where we *have* to use a stack slot. This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
 define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
 entry:
-; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: test4
-; CHECK: pushq %r15
-; CHECK: pushq %r14
-; CHECK: pushq %r13
-; CHECK: pushq %r12
-; CHECK: pushq %rbx
-; CHECK: pushq %rax
-; CHECK: movl 128(%rsp), %r13d
-; CHECK-NEXT: movl 120(%rsp), %r12d
-; CHECK-NEXT: movl 112(%rsp), %r15d
-; CHECK-NEXT: movl 104(%rsp), %r14d
-; CHECK-NEXT: movl 96(%rsp), %ebp
-; CHECK-NEXT: movl 88(%rsp), %ebx
-; CHECK-NEXT: movl 80(%rsp), %r11d
-; CHECK-NEXT: movl 72(%rsp), %r10d
-; CHECK-NEXT: movl 64(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Ltmp11:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
   ret void
@@ -90,7 +79,7 @@
 ; CHECK: movl %edi, %ebx
 ; CHECK: movl %ebx, 12(%rsp)
 ; CHECK-NEXT: callq _baz
-; CHECK-NEXT: Ltmp30:
+; CHECK-NEXT: Ltmp
 ; CHECK-NEXT: callq _bar
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
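
As a discussion aid, here is a small self-contained C++ sketch (not part of the patch) of the retry loop added to PeepholeOptimizer.cpp above. FakeInstr and tryFoldOne are invented stand-ins for MachineInstr and TII->optimizeLoadInstr; the only point is the shape of the new driver: fold at most one operand per call, then retry until nothing more folds.

// Toy model of the "keep folding until nothing folds" driver loop.
// FakeInstr and tryFoldOne are invented for illustration, not LLVM code.
#include <iostream>
#include <set>
#include <vector>

struct FakeInstr {
  std::vector<unsigned> RegOperands; // register operands read by the instr
};

// Fold at most one candidate operand per call, mirroring the single-fold
// contract of optimizeLoadInstr.  Returns the folded instruction on success
// (here simply the same object, mutated) or nullptr when no operand matched.
FakeInstr *tryFoldOne(FakeInstr &MI, std::set<unsigned> &Candidates) {
  for (auto It = MI.RegOperands.begin(); It != MI.RegOperands.end(); ++It) {
    if (Candidates.count(*It)) {
      Candidates.erase(*It);    // the defining load has been folded away
      MI.RegOperands.erase(It); // the value now comes from memory
      return &MI;
    }
  }
  return nullptr;
}

int main() {
  FakeInstr MI{{1, 2, 3, 4}};
  std::set<unsigned> Candidates{2, 4}; // vregs defined by foldable loads

  // The driver: retry until a call reports that nothing more can be folded.
  unsigned NumFolds = 0;
  while (tryFoldOne(MI, Candidates))
    ++NumFolds;

  std::cout << "folded " << NumFolds << " operands, "
            << MI.RegOperands.size() << " register operands remain\n";
  return 0;
}

One difference worth keeping in mind: the toy mutates the instruction in place, whereas in the patch each successful fold produces a new FoldMI, which is why the while loop re-points MI at the returned instruction before the next attempt.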
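
Similarly, a rough sketch of what the X86InstrInfo.cpp change does to the operand scan (Operand and collectFoldableUses are invented names, not the LLVM API): instead of bailing out as soon as FoldAsLoadDefReg is used more than once, collect every use so foldMemoryOperand can be asked to rewrite all of them, while still refusing to fold on a subreg use or a def.

// Rough sketch of the new operand scan in optimizeLoadInstr: gather every
// use of the reloaded vreg instead of giving up when there is more than one.
#include <iostream>
#include <vector>

struct Operand {
  unsigned Reg;
  bool IsDef;
  bool HasSubReg;
};

// Returns the operand indices that may be folded, or an empty vector when
// folding must be refused entirely (a subreg use or a def of FoldReg).
std::vector<unsigned> collectFoldableUses(const std::vector<Operand> &Ops,
                                          unsigned FoldReg) {
  std::vector<unsigned> Ids;
  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
    if (Ops[i].Reg != FoldReg)
      continue;
    if (Ops[i].HasSubReg || Ops[i].IsDef)
      return {};
    Ids.push_back(i);
  }
  return Ids;
}

int main() {
  // A statepoint-like instruction that reads vreg 7 three times.
  std::vector<Operand> Ops = {{5, false, false},
                              {7, false, false},
                              {7, false, false},
                              {9, false, false},
                              {7, false, false}};
  std::vector<unsigned> Ids = collectFoldableUses(Ops, 7);
  std::cout << "would hand " << Ids.size()
            << " operand indices to foldMemoryOperand\n";
  return 0;
}

This corresponds to the patched code handing a SmallVector of operand indices to foldMemoryOperand rather than a single SrcOperandId; whether all of them can actually be folded remains foldMemoryOperand's decision.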