Index: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1540,11 +1540,6 @@
       if (MI->isDebugValue())
         continue;
 
-      // If we run into an instruction we can't fold across, discard
-      // the load candidates.
-      if (MI->isLoadFoldBarrier())
-        FoldAsLoadDefCandidates.clear();
-
       if (MI->isPosition() || MI->isPHI())
         continue;
 
@@ -1588,7 +1583,6 @@
         DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
                      << '\n');
         NAPhysToVirtMIs.clear();
-        continue;
       }
 
       if ((isUncoalescableCopy(*MI) &&
@@ -1639,8 +1633,14 @@
       // earlier load into MI.
       if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
           !FoldAsLoadDefCandidates.empty()) {
+
+        // We visit each operand even after successfully folding a previous
+        // one. This allows us to fold multiple loads into a single
+        // instruction. We do assume that optimizeLoadInstr doesn't insert
+        // foldable uses earlier in the argument list. Since we don't restart
+        // iteration, we'd miss such cases.
         const MCInstrDesc &MIDesc = MI->getDesc();
-        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
+        for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
              ++i) {
           const MachineOperand &MOp = MI->getOperand(i);
           if (!MOp.isReg())
@@ -1667,13 +1667,23 @@
               MRI->markUsesInDebugValueAsUndef(FoldedReg);
               FoldAsLoadDefCandidates.erase(FoldedReg);
               ++NumLoadFold;
-              // MI is replaced with FoldMI.
+
+              // MI is replaced with FoldMI so we can continue trying to fold
               Changed = true;
-              break;
+              MI = FoldMI;
             }
           }
         }
       }
+
+      // If we run into an instruction we can't fold across, discard
+      // the load candidates. Note: We might be able to fold *into* this
+      // instruction, so this needs to be after the folding logic.
+      if (MI->isLoadFoldBarrier()) {
+        DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n");
+        FoldAsLoadDefCandidates.clear();
+      }
+
     }
   }
 
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -6576,14 +6576,6 @@
                                         const MachineRegisterInfo *MRI,
                                         unsigned &FoldAsLoadDefReg,
                                         MachineInstr *&DefMI) const {
-  if (FoldAsLoadDefReg == 0)
-    return nullptr;
-  // To be conservative, if there exists another load, clear the load candidate.
-  if (MI.mayLoad()) {
-    FoldAsLoadDefReg = 0;
-    return nullptr;
-  }
-
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
@@ -6592,27 +6584,24 @@
     return nullptr;
 
   // Collect information about virtual register operands of MI.
-  unsigned SrcOperandId = 0;
-  bool FoundSrcOperand = false;
-  for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+  SmallVector<unsigned, 1> SrcOperandIds;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (Reg != FoldAsLoadDefReg)
       continue;
-    // Do not fold if we have a subreg use or a def or multiple uses.
-    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+    // Do not fold if we have a subreg use or a def.
+    if (MO.getSubReg() || MO.isDef())
       return nullptr;
-
-    SrcOperandId = i;
-    FoundSrcOperand = true;
+    SrcOperandIds.push_back(i);
   }
-  if (!FoundSrcOperand)
+  if (SrcOperandIds.empty())
     return nullptr;
 
   // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
   }
Index: llvm/trunk/test/CodeGen/X86/anyregcc.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/anyregcc.ll
+++ llvm/trunk/test/CodeGen/X86/anyregcc.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT: .quad 56
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _anyreg_test2
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _patchpoint_spilldef
 ; CHECK-NEXT: .quad 56
@@ -272,31 +272,31 @@
 ; CHECK-NEXT: .byte 8
 ; CHECK-NEXT: .short {{[0-9]+}}
 ; CHECK-NEXT: .long 0
-; Loc 9: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 10: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 11: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 12: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 13: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; Loc 9: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 10: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 11: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 12: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 13: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
Index: llvm/trunk/test/CodeGen/X86/stackmap.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/stackmap.ll
+++ llvm/trunk/test/CodeGen/X86/stackmap.ll
@@ -38,10 +38,10 @@
 ; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spilledValue
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spilledStackMapValue
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spillSubReg
 ; CHECK-NEXT: .quad 56
Index: llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
+++ llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
@@ -34,35 +34,24 @@
 entry:
 ; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: @test3
-; CHECK: movl 32(%rsp), %r10d
-; CHECK-NEXT: movl 24(%rsp), %r11d
-; CHECK-NEXT: movl 16(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Lcfi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
   ret void
 }
 
 ; This case just confirms that we don't crash when given more live values
-; than registers. This is a case where we *have* to use a stack slot.
+; than registers. This is a case where we *have* to use a stack slot. This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
 define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
 entry:
-; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: test4
-; CHECK: pushq %r15
-; CHECK: pushq %r14
-; CHECK: pushq %r13
-; CHECK: pushq %r12
-; CHECK: pushq %rbx
-; CHECK: pushq %rax
-; CHECK: movl 128(%rsp), %r13d
-; CHECK-NEXT: movl 120(%rsp), %r12d
-; CHECK-NEXT: movl 112(%rsp), %r15d
-; CHECK-NEXT: movl 104(%rsp), %r14d
-; CHECK-NEXT: movl 96(%rsp), %ebp
-; CHECK-NEXT: movl 88(%rsp), %ebx
-; CHECK-NEXT: movl 80(%rsp), %r11d
-; CHECK-NEXT: movl 72(%rsp), %r10d
-; CHECK-NEXT: movl 64(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Lcfi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
   ret void
@@ -90,7 +79,7 @@
 ; CHECK: movl %edi, %ebx
 ; CHECK: movl %ebx, 12(%rsp)
 ; CHECK-NEXT: callq _baz
-; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: Ltmp
 ; CHECK-NEXT: callq _bar
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
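
Note (not part of the patch): the snippet below is a standalone, simplified model of the control flow this change introduces in PeepholeOptimizer, written for illustration only; the Instr struct, the tryFold() helper, and the register numbers are invented and do not correspond to LLVM's API. It shows the two behavioral changes: after a successful fold the remaining operands are still scanned, so several loads can be folded into a single instruction, and the load-fold-barrier check now runs after the folding attempt, so a barrier instruction can itself still consume candidates before the set is cleared.

// Standalone sketch (C++11), not LLVM code: models the new folding loop.
#include <iostream>
#include <set>
#include <vector>

struct Instr {
  std::vector<int> Uses; // virtual registers read by this instruction
  bool IsBarrier;        // stands in for MachineInstr::isLoadFoldBarrier()
};

// Stand-in for TII->optimizeLoadInstr(): "fold" one use of Reg into a
// memory operand, marked here by -1.
static bool tryFold(Instr &MI, int Reg) {
  for (int &U : MI.Uses)
    if (U == Reg) {
      U = -1;
      return true;
    }
  return false;
}

int main() {
  std::set<int> Candidates = {1, 2, 3}; // regs defined by foldable loads
  Instr MI = {{1, 2, 4}, true};         // uses two candidates, is a barrier

  // New behavior: no break after the first successful fold, so both
  // candidate uses (r1 and r2) get folded into the same instruction.
  for (std::size_t i = 0; i != MI.Uses.size(); ++i) {
    int Reg = MI.Uses[i];
    if (Candidates.count(Reg) && tryFold(MI, Reg)) {
      Candidates.erase(Reg);
      std::cout << "folded load of r" << Reg << "\n";
    }
  }

  // Barrier handling runs after the folding attempt, mirroring the move of
  // the isLoadFoldBarrier() check in the patch: the barrier itself may have
  // consumed candidates; only the leftovers are discarded.
  if (MI.IsBarrier)
    Candidates.clear();
  return 0;
}

On the target side, X86InstrInfo::optimizeLoadInstr makes the matching change: it collects every use of the candidate register into SrcOperandIds instead of giving up when it sees more than one, and hands the whole list to foldMemoryOperand.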