Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -1324,6 +1324,7 @@
   if (!MI->getOperand(0).getSubReg() &&
       TargetRegisterInfo::isVirtualRegister(Reg) &&
       MRI->hasOneNonDBGUse(Reg)) {
+    //dbgs() << Reg << " might be foldable\n";
     FoldAsLoadDefCandidates.insert(Reg);
     return true;
   }
@@ -1540,11 +1541,6 @@
       if (MI->isDebugValue())
         continue;
 
-      // If we run into an instruction we can't fold across, discard
-      // the load candidates.
-      if (MI->isLoadFoldBarrier())
-        FoldAsLoadDefCandidates.clear();
-
      if (MI->isPosition() || MI->isPHI())
        continue;
 
@@ -1588,7 +1584,8 @@
         DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to "
                      << *MI << '\n');
         NAPhysToVirtMIs.clear();
-        continue;
+        // FOR DISCUSSION - Is it legal to delete this continue?
+        // continue;
       }
 
       if ((isUncoalescableCopy(*MI) &&
@@ -1639,41 +1636,69 @@
       // earlier load into MI.
       if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
           !FoldAsLoadDefCandidates.empty()) {
-        const MCInstrDesc &MIDesc = MI->getDesc();
-        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
-             ++i) {
-          const MachineOperand &MOp = MI->getOperand(i);
-          if (!MOp.isReg())
-            continue;
-          unsigned FoldAsLoadDefReg = MOp.getReg();
-          if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
-            // We need to fold load after optimizeCmpInstr, since
-            // optimizeCmpInstr can enable folding by converting SUB to CMP.
-            // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
-            // we need it for markUsesInDebugValueAsUndef().
-            unsigned FoldedReg = FoldAsLoadDefReg;
-            MachineInstr *DefMI = nullptr;
-            if (MachineInstr *FoldMI =
-                    TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
-              // Update LocalMIs since we replaced MI with FoldMI and deleted
-              // DefMI.
-              DEBUG(dbgs() << "Replacing: " << *MI);
-              DEBUG(dbgs() << "     With: " << *FoldMI);
-              LocalMIs.erase(MI);
-              LocalMIs.erase(DefMI);
-              LocalMIs.insert(FoldMI);
-              MI->eraseFromParent();
-              DefMI->eraseFromParent();
-              MRI->markUsesInDebugValueAsUndef(FoldedReg);
-              FoldAsLoadDefCandidates.erase(FoldedReg);
-              ++NumLoadFold;
-              // MI is replaced with FoldMI.
-              Changed = true;
-              break;
+
+        auto tryOptimizeLoad = [&]() {
+          const MCInstrDesc &MIDesc = MI->getDesc();
+          for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
+               ++i) {
+            //dbgs() << i << "\n";
+            const MachineOperand &MOp = MI->getOperand(i);
+            //dbgs() << MOp << "\n";
+            if (!MOp.isReg())
+              continue;
+            unsigned FoldAsLoadDefReg = MOp.getReg();
+            //dbgs() << "Candidate: " << FoldAsLoadDefReg << "\n";
+            if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
+              //dbgs() << "Found candidate to fold\n";
+              // We need to fold load after optimizeCmpInstr, since
+              // optimizeCmpInstr can enable folding by converting SUB to CMP.
+              // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
+              // we need it for markUsesInDebugValueAsUndef().
+              unsigned FoldedReg = FoldAsLoadDefReg;
+              MachineInstr *DefMI = nullptr;
+              if (MachineInstr *FoldMI =
+                      TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
+                // Update LocalMIs since we replaced MI with FoldMI and deleted
+                // DefMI.
+ DEBUG(dbgs() << "Replacing: " << *MI); + DEBUG(dbgs() << " With: " << *FoldMI); + LocalMIs.erase(MI); + LocalMIs.erase(DefMI); + LocalMIs.insert(FoldMI); + MI->eraseFromParent(); + DefMI->eraseFromParent(); + MRI->markUsesInDebugValueAsUndef(FoldedReg); + FoldAsLoadDefCandidates.erase(FoldedReg); + ++NumLoadFold; + return FoldMI; + } } } + return (MachineInstr*)nullptr; + }; + + // Keep trying to optimize the load until we can't fold any further. + // FOR DISCUSSION + // Note: This is O(n^2) in the number of operands if + // TII->optimizeLoadInstr were to fail for every other operand. + while(true) { + auto *FoldMI = tryOptimizeLoad(); + if (!FoldMI) + break; // No folding occurred + + // MI is replaced with FoldMI. + Changed = true; + MI = FoldMI; } } + // If we run into an instruction we can't fold across, discard + // the load candidates. Note: We might be able to fold *into* this + // instruction, so this needs to be after the folding logic. + if (MI->isLoadFoldBarrier()) { + DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n"); + FoldAsLoadDefCandidates.clear(); + } + } } Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -5331,13 +5331,21 @@ const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { + // FOR DISCUSSION - Is this left over from when we had a single variable as + // opposed to a set of FoldAsLoadDefs? Can we remove? if (FoldAsLoadDefReg == 0) return nullptr; + + // FOR DISCUSSION - Do we need this code at all? If so, why? Shouldn't + // foldMemoryOperand be robust against this case? +#if 0 // To be conservative, if there exists another load, clear the load candidate. - if (MI.mayLoad()) { + if (MI.mayLoad() && + MI.getOpcode() != TargetOpcode::STATEPOINT) { FoldAsLoadDefReg = 0; return nullptr; } +#endif // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); @@ -5347,27 +5355,24 @@ return nullptr; // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { + SmallVector SrcOperandIds; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; - // Do not fold if we have a subreg use or a def or multiple uses. - if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) + // Do not fold if we have a subreg use or a def. + if (MO.getSubReg() || MO.isDef()) return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; + SrcOperandIds.push_back(i); } - if (!FoundSrcOperand) + if (SrcOperandIds.empty()) return nullptr; // Check whether we can fold the def into SrcOperandId. - if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } Index: test/CodeGen/X86/statepoint-live-in.ll =================================================================== --- test/CodeGen/X86/statepoint-live-in.ll +++ test/CodeGen/X86/statepoint-live-in.ll @@ -34,35 +34,24 @@ entry: ; TODO: We should have folded the reload into the statepoint. 
 ; CHECK-LABEL: @test3
-; CHECK: movl 32(%rsp), %r10d
-; CHECK-NEXT: movl 24(%rsp), %r11d
-; CHECK-NEXT: movl 16(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Ltmp9:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
   ret void
 }
 
 ; This case just confirms that we don't crash when given more live values
-; than registers. This is a case where we *have* to use a stack slot.
+; than registers. This is a case where we *have* to use a stack slot. This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
 define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
 entry:
-; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: test4
-; CHECK: pushq %r15
-; CHECK: pushq %r14
-; CHECK: pushq %r13
-; CHECK: pushq %r12
-; CHECK: pushq %rbx
-; CHECK: pushq %rax
-; CHECK: movl 128(%rsp), %r13d
-; CHECK-NEXT: movl 120(%rsp), %r12d
-; CHECK-NEXT: movl 112(%rsp), %r15d
-; CHECK-NEXT: movl 104(%rsp), %r14d
-; CHECK-NEXT: movl 96(%rsp), %ebp
-; CHECK-NEXT: movl 88(%rsp), %ebx
-; CHECK-NEXT: movl 80(%rsp), %r11d
-; CHECK-NEXT: movl 72(%rsp), %r10d
-; CHECK-NEXT: movl 64(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Ltmp11:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
   ret void
@@ -90,7 +79,7 @@
 ; CHECK: movl %edi, %ebx
 ; CHECK: movl %ebx, 12(%rsp)
 ; CHECK-NEXT: callq _baz
-; CHECK-NEXT: Ltmp30:
+; CHECK-NEXT: Ltmp
 ; CHECK-NEXT: callq _bar
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
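
As a discussion aid, here is a small self-contained C++ sketch (not part of the patch) of the retry loop added to PeepholeOptimizer.cpp above. FakeInstr and tryFoldOne are invented stand-ins for MachineInstr and TII->optimizeLoadInstr; the only point is the shape of the new driver: fold at most one operand per call, then retry until nothing more folds.

// Toy model of the "keep folding until nothing folds" driver loop.
// FakeInstr and tryFoldOne are invented for illustration, not LLVM code.
#include <iostream>
#include <set>
#include <vector>

struct FakeInstr {
  std::vector<unsigned> RegOperands; // register operands read by the instr
};

// Fold at most one candidate operand per call, mirroring the single-fold
// contract of optimizeLoadInstr.  Returns the folded instruction on success
// (here simply the same object, mutated) or nullptr when no operand matched.
FakeInstr *tryFoldOne(FakeInstr &MI, std::set<unsigned> &Candidates) {
  for (auto It = MI.RegOperands.begin(); It != MI.RegOperands.end(); ++It) {
    if (Candidates.count(*It)) {
      Candidates.erase(*It);    // the defining load has been folded away
      MI.RegOperands.erase(It); // the value now comes from memory
      return &MI;
    }
  }
  return nullptr;
}

int main() {
  FakeInstr MI{{1, 2, 3, 4}};
  std::set<unsigned> Candidates{2, 4}; // vregs defined by foldable loads

  // The driver: retry until a call reports that nothing more can be folded.
  unsigned NumFolds = 0;
  while (tryFoldOne(MI, Candidates))
    ++NumFolds;

  std::cout << "folded " << NumFolds << " operands, "
            << MI.RegOperands.size() << " register operands remain\n";
  return 0;
}

One difference worth keeping in mind: the toy mutates the instruction in place, whereas in the patch each successful fold produces a new FoldMI, which is why the while loop re-points MI at the returned instruction before the next attempt.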
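
Similarly, a rough sketch of what the X86InstrInfo.cpp change does to the operand scan (Operand and collectFoldableUses are invented names, not the LLVM API): instead of bailing out as soon as FoldAsLoadDefReg is used more than once, collect every use so foldMemoryOperand can be asked to rewrite all of them, while still refusing to fold on a subreg use or a def.

// Rough sketch of the new operand scan in optimizeLoadInstr: gather every
// use of the reloaded vreg instead of giving up when there is more than one.
#include <iostream>
#include <vector>

struct Operand {
  unsigned Reg;
  bool IsDef;
  bool HasSubReg;
};

// Returns the operand indices that may be folded, or an empty vector when
// folding must be refused entirely (a subreg use or a def of FoldReg).
std::vector<unsigned> collectFoldableUses(const std::vector<Operand> &Ops,
                                          unsigned FoldReg) {
  std::vector<unsigned> Ids;
  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
    if (Ops[i].Reg != FoldReg)
      continue;
    if (Ops[i].HasSubReg || Ops[i].IsDef)
      return {};
    Ids.push_back(i);
  }
  return Ids;
}

int main() {
  // A statepoint-like instruction that reads vreg 7 three times.
  std::vector<Operand> Ops = {{5, false, false},
                              {7, false, false},
                              {7, false, false},
                              {9, false, false},
                              {7, false, false}};
  std::vector<unsigned> Ids = collectFoldableUses(Ops, 7);
  std::cout << "would hand " << Ids.size()
            << " operand indices to foldMemoryOperand\n";
  return 0;
}

This corresponds to the patched code handing a SmallVector of operand indices to foldMemoryOperand rather than a single SrcOperandId; whether all of them can actually be folded remains foldMemoryOperand's decision.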