Index: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1540,11 +1540,6 @@
       if (MI->isDebugValue())
         continue;
 
-      // If we run into an instruction we can't fold across, discard
-      // the load candidates.
-      if (MI->isLoadFoldBarrier())
-        FoldAsLoadDefCandidates.clear();
-
       if (MI->isPosition() || MI->isPHI())
         continue;
 
@@ -1588,7 +1583,6 @@
         DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
                      << '\n');
         NAPhysToVirtMIs.clear();
-        continue;
       }
 
       if ((isUncoalescableCopy(*MI) &&
@@ -1639,8 +1633,14 @@
       // earlier load into MI.
       if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
           !FoldAsLoadDefCandidates.empty()) {
+
+        // We visit each operand even after successfully folding a previous
+        // one. This allows us to fold multiple loads into a single
+        // instruction. We do assume that optimizeLoadInstr doesn't insert
+        // foldable uses earlier in the argument list. Since we don't restart
+        // iteration, we'd miss such cases.
         const MCInstrDesc &MIDesc = MI->getDesc();
-        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
+        for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
              ++i) {
           const MachineOperand &MOp = MI->getOperand(i);
           if (!MOp.isReg())
@@ -1667,13 +1667,23 @@
               MRI->markUsesInDebugValueAsUndef(FoldedReg);
               FoldAsLoadDefCandidates.erase(FoldedReg);
               ++NumLoadFold;
-              // MI is replaced with FoldMI.
+
+              // MI is replaced with FoldMI so we can continue trying to fold
               Changed = true;
-              break;
+              MI = FoldMI;
             }
           }
         }
       }
+
+      // If we run into an instruction we can't fold across, discard
+      // the load candidates. Note: We might be able to fold *into* this
+      // instruction, so this needs to be after the folding logic.
+      if (MI->isLoadFoldBarrier()) {
+        DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n");
+        FoldAsLoadDefCandidates.clear();
+      }
+
     }
   }
 
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -6576,14 +6576,6 @@
                                         const MachineRegisterInfo *MRI,
                                         unsigned &FoldAsLoadDefReg,
                                         MachineInstr *&DefMI) const {
-  if (FoldAsLoadDefReg == 0)
-    return nullptr;
-  // To be conservative, if there exists another load, clear the load candidate.
-  if (MI.mayLoad()) {
-    FoldAsLoadDefReg = 0;
-    return nullptr;
-  }
-
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
@@ -6592,27 +6584,24 @@
     return nullptr;
 
   // Collect information about virtual register operands of MI.
-  unsigned SrcOperandId = 0;
-  bool FoundSrcOperand = false;
-  for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+  SmallVector<unsigned, 1> SrcOperandIds;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (Reg != FoldAsLoadDefReg)
       continue;
-    // Do not fold if we have a subreg use or a def or multiple uses.
-    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+    // Do not fold if we have a subreg use or a def.
+    if (MO.getSubReg() || MO.isDef())
       return nullptr;
-
-    SrcOperandId = i;
-    FoundSrcOperand = true;
+    SrcOperandIds.push_back(i);
   }
-  if (!FoundSrcOperand)
+  if (SrcOperandIds.empty())
     return nullptr;
 
   // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
   }
Index: llvm/trunk/test/CodeGen/X86/anyregcc.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/anyregcc.ll
+++ llvm/trunk/test/CodeGen/X86/anyregcc.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT: .quad 56
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _anyreg_test2
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _patchpoint_spilldef
 ; CHECK-NEXT: .quad 56
@@ -272,31 +272,31 @@
 ; CHECK-NEXT: .byte 8
 ; CHECK-NEXT: .short {{[0-9]+}}
 ; CHECK-NEXT: .long 0
-; Loc 9: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 10: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 11: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 12: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 13: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; Loc 9: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 10: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 11: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 12: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 13: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
Index: llvm/trunk/test/CodeGen/X86/stackmap.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/stackmap.ll
+++ llvm/trunk/test/CodeGen/X86/stackmap.ll
@@ -38,10 +38,10 @@
 ; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spilledValue
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spilledStackMapValue
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spillSubReg
 ; CHECK-NEXT: .quad 56
Index: llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
+++ llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
@@ -34,35 +34,24 @@
 entry:
 ; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: @test3
-; CHECK: movl 32(%rsp), %r10d
-; CHECK-NEXT: movl 24(%rsp), %r11d
-; CHECK-NEXT: movl 16(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Lcfi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
   ret void
 }
 
 ; This case just confirms that we don't crash when given more live values
-; than registers. This is a case where we *have* to use a stack slot.
+; than registers. This is a case where we *have* to use a stack slot. This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
 define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
 entry:
-; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: test4
-; CHECK: pushq %r15
-; CHECK: pushq %r14
-; CHECK: pushq %r13
-; CHECK: pushq %r12
-; CHECK: pushq %rbx
-; CHECK: pushq %rax
-; CHECK: movl 128(%rsp), %r13d
-; CHECK-NEXT: movl 120(%rsp), %r12d
-; CHECK-NEXT: movl 112(%rsp), %r15d
-; CHECK-NEXT: movl 104(%rsp), %r14d
-; CHECK-NEXT: movl 96(%rsp), %ebp
-; CHECK-NEXT: movl 88(%rsp), %ebx
-; CHECK-NEXT: movl 80(%rsp), %r11d
-; CHECK-NEXT: movl 72(%rsp), %r10d
-; CHECK-NEXT: movl 64(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Lcfi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
   ret void
@@ -90,7 +79,7 @@
 ; CHECK: movl %edi, %ebx
 ; CHECK: movl %ebx, 12(%rsp)
 ; CHECK-NEXT: callq _baz
-; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: Ltmp
 ; CHECK-NEXT: callq _bar
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
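
Note (not part of the patch): the snippet below is a standalone, simplified model of the control flow this change introduces in PeepholeOptimizer, written for illustration only; the Instr struct, the tryFold() helper, and the register numbers are invented and do not correspond to LLVM's API. It shows the two behavioral changes: after a successful fold the remaining operands are still scanned, so several loads can be folded into a single instruction, and the load-fold-barrier check now runs after the folding attempt, so a barrier instruction can itself still consume candidates before the set is cleared.

// Standalone sketch (C++11), not LLVM code: models the new folding loop.
#include <iostream>
#include <set>
#include <vector>

struct Instr {
  std::vector<int> Uses; // virtual registers read by this instruction
  bool IsBarrier;        // stands in for MachineInstr::isLoadFoldBarrier()
};

// Stand-in for TII->optimizeLoadInstr(): "fold" one use of Reg into a
// memory operand, marked here by -1.
static bool tryFold(Instr &MI, int Reg) {
  for (int &U : MI.Uses)
    if (U == Reg) {
      U = -1;
      return true;
    }
  return false;
}

int main() {
  std::set<int> Candidates = {1, 2, 3}; // regs defined by foldable loads
  Instr MI = {{1, 2, 4}, true};         // uses two candidates, is a barrier

  // New behavior: no break after the first successful fold, so both
  // candidate uses (r1 and r2) get folded into the same instruction.
  for (std::size_t i = 0; i != MI.Uses.size(); ++i) {
    int Reg = MI.Uses[i];
    if (Candidates.count(Reg) && tryFold(MI, Reg)) {
      Candidates.erase(Reg);
      std::cout << "folded load of r" << Reg << "\n";
    }
  }

  // Barrier handling runs after the folding attempt, mirroring the move of
  // the isLoadFoldBarrier() check in the patch: the barrier itself may have
  // consumed candidates; only the leftovers are discarded.
  if (MI.IsBarrier)
    Candidates.clear();
  return 0;
}

On the target side, X86InstrInfo::optimizeLoadInstr makes the matching change: it collects every use of the candidate register into SrcOperandIds instead of giving up when it sees more than one, and hands the whole list to foldMemoryOperand.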