Index: include/llvm/CodeGen/MachineFunction.h
===================================================================
--- include/llvm/CodeGen/MachineFunction.h
+++ include/llvm/CodeGen/MachineFunction.h
@@ -287,6 +287,14 @@
   /// Should we be emitting segmented stack stuff for the function
   bool shouldSplitStack();
 
+  /// \brief Should we be probing the stack for the function.
+  ///
+  /// Probing the stack means that we must read or write to the stack on every
+  /// page. This is to ensure that a guard page will be hit and stack overflow
+  /// can be detected. We insert instructions to do this when allocating from
+  /// the stack.
+  bool shouldProbeStack() const;
+
   /// getNumBlockIDs - Return the number of MBB ID's allocated.
   ///
   unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); }
Index: lib/CodeGen/MachineFunction.cpp
===================================================================
--- lib/CodeGen/MachineFunction.cpp
+++ lib/CodeGen/MachineFunction.cpp
@@ -147,6 +147,10 @@
   return getFunction()->hasFnAttribute("split-stack");
 }
 
+bool MachineFunction::shouldProbeStack() const {
+  return getFunction()->hasFnAttribute("probe-stack");
+}
+
 /// This discards all of the MachineBasicBlock numbers and recomputes them.
 /// This guarantees that the MBB numbers are sequential, dense, and match the
 /// ordering of the blocks within the function.  If a specific MachineBasicBlock
Index: lib/CodeGen/PrologEpilogInserter.cpp
===================================================================
--- lib/CodeGen/PrologEpilogInserter.cpp
+++ lib/CodeGen/PrologEpilogInserter.cpp
@@ -773,6 +773,10 @@
   // Add prologue to the function...
   TFI.emitPrologue(Fn, *SaveBlock);
 
+  // RestoreBlocks can be invalidated by emitPrologue (it may split blocks). Recalculate them.
+  RestoreBlocks.clear();
+  calculateSets(Fn);
+
   // Add epilogue to restore the callee-save registers in each exiting block.
   for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
     TFI.emitEpilogue(Fn, *RestoreBlock);
Index: lib/Target/X86/X86FrameLowering.h
===================================================================
--- lib/Target/X86/X86FrameLowering.h
+++ lib/Target/X86/X86FrameLowering.h
@@ -47,11 +47,26 @@
 
   unsigned StackPtr;
 
+  void pushRegForStackProbeCall(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                DebugLoc DL,
+                                bool &IsAlive,
+                                unsigned RegType,
+                                uint64_t &NumBytes) const;
+  void popRegForStackProbeCall(MachineFunction &MF,
+                               MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               DebugLoc DL,
+                               bool &IsAlive,
+                               unsigned RegType,
+                               uint64_t &NumBytes) const;
-  /// Emit a call to the target's stack probe function. This is required for all
-  /// large stack allocations on Windows. The caller is required to materialize
-  /// the number of bytes to probe in RAX/EAX.
-  void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI, DebugLoc DL) const;
+  /// Emit inline stack probes for a large stack allocation. The caller is
+  /// required to materialize the number of bytes to probe in RAX/EAX.
+  /// Returns the instruction at which execution resumes after the probes.
+  MachineInstr *emitStackProbes(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                bool InProlog) const;
 
   void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
@@ -59,7 +74,7 @@
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &InMBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void adjustForSegmentedStacks(MachineFunction &MF,
Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -197,13 +197,14 @@
   return 0;
 }
 
-static bool isEAXLiveIn(MachineFunction &MF) {
+static bool isLiveIn(MachineFunction &MF, unsigned CheckReg) {
+  CheckReg = getX86SubSuperRegister(CheckReg, MVT::i32);
+
   for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
        EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
     unsigned Reg = II->first;
 
-    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
-        Reg == X86::AH || Reg == X86::AL)
+    if (getX86SubSuperRegisterOrZero(Reg, MVT::i32) == CheckReg)
       return true;
   }
 
@@ -250,7 +251,7 @@
       // load the offset into a register and do one sub/add
       unsigned Reg = 0;
 
-      if (isSub && !isEAXLiveIn(*MBB.getParent()))
+      if (isSub && !isLiveIn(*MBB.getParent(), X86::EAX))
         Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
       else
         Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -425,60 +426,198 @@
   return false;
 }
 
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL) const {
-  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
+void X86FrameLowering::pushRegForStackProbeCall(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MBBI,
+                                                DebugLoc DL,
+                                                bool &IsAlive,
+                                                unsigned RegType,
+                                                uint64_t &NumBytes) const {
+  IsAlive = isLiveIn(MF, RegType);
 
-  unsigned CallOp;
-  if (Is64Bit)
-    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
-  else
-    CallOp = X86::CALLpcrel32;
+  if (!IsAlive) {
+    return;
+  }
 
-  const char *Symbol;
-  if (Is64Bit) {
-    if (STI.isTargetCygMing()) {
-      Symbol = "___chkstk_ms";
-    } else {
-      Symbol = "__chkstk";
-    }
-  } else if (STI.isTargetCygMing())
-    Symbol = "_alloca";
-  else
-    Symbol = "_chkstk";
+  auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32);
 
-  MachineInstrBuilder CI;
+  // Save the register on the stack.
+  BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(Reg, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
 
-  // All current stack probes take AX and SP as input, clobber flags, and
-  // preserve all registers. x86_64 probes leave RSP unmodified.
-  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
-    // For the large code model, we have to call through a register. Use R11,
-    // as it is scratch in all supported calling conventions.
-    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
-        .addExternalSymbol(Symbol);
-    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
+  // Reuse the space from the spill as a stack allocation.
+  NumBytes -= SlotSize;
+}
+
+void X86FrameLowering::popRegForStackProbeCall(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MBBI,
+                                                DebugLoc DL,
+                                                bool &IsAlive,
+                                                unsigned RegType,
+                                                uint64_t &NumBytes) const {
+  if (!IsAlive) {
+    return;
+  }
+
+  // Restore the register from the stack slot.
+
+  auto Reg = getX86SubSuperRegister(RegType, Is64Bit ? MVT::i64 : MVT::i32);
+
+  auto MIB = BuildMI(MF, DL,
+                     TII.get(Is64Bit ? X86::MOV64rm : X86::MOV32rm),
+                     Reg);
+  MachineInstr *MI = addRegOffset(MIB, StackPtr, false, NumBytes);
+  MI->setFlag(MachineInstr::FrameSetup);
+  MBB.insert(MBBI, MI);
+
+  NumBytes += SlotSize;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbes(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MBBI,
+                                                DebugLoc DL,
+                                                bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+  // RAX contains the number of bytes of desired stack adjustment.
+  // The handling here assumes this value has already been updated so as to
+  // maintain stack alignment.
+  //
+  // We need to exit with RSP modified by this amount and execute suitable
+  // page touches to notify the OS that we're growing the stack responsibly.
+  // All stack probing must be done without modifying RSP.
+
+  // Set up the new basic blocks
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+
+  MF.insert(MBBIter, LoopMBB);
+  MF.insert(MBBIter, ContinueMBB);
+
+  // Split MBB and move the tail portion down to ContinueMBB.
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  const int64_t GuardSize = 0x1000;
+
+  auto RType = Is64Bit ? MVT::i64 : MVT::i32;
+
+  auto InputReg = getX86SubSuperRegister(X86::RAX, RType);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass = Is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned
+      SizeReg = InProlog ? InputReg :
+                           MRI.createVirtualRegister(RegClass),
+      StackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType) :
+                            MRI.createVirtualRegister(RegClass),
+      CountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType) :
+                            MRI.createVirtualRegister(RegClass),
+      PStackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType) :
+                            MRI.createVirtualRegister(RegClass),
+      PCountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType) :
+                            MRI.createVirtualRegister(RegClass),
+      LStackReg = InProlog ? getX86SubSuperRegister(X86::RCX, RType) :
+                            MRI.createVirtualRegister(RegClass),
+      LCountReg = InProlog ? getX86SubSuperRegister(X86::RDX, RType) :
+                            MRI.createVirtualRegister(RegClass);
+
+  auto SPReg = getX86SubSuperRegister(X86::RSP, RType);
+
+  if (InProlog) {
+    ContinueMBB->addLiveIn(InputReg);
+    LoopMBB->addLiveIn(InputReg);
+    LoopMBB->addLiveIn(LCountReg);
+    LoopMBB->addLiveIn(LStackReg);
+
+    for (MachineBasicBlock::livein_iterator i = MBB.livein_begin(),
+                                            e = MBB.livein_end();
+         i != e; i++) {
+      if (!LoopMBB->isLiveIn(*i))
+        LoopMBB->addLiveIn(*i);
+      if (!ContinueMBB->isLiveIn(*i))
+        ContinueMBB->addLiveIn(*i);
+    }
   } else {
-    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+    BuildMI(&MBB, DL, TII.get(X86::COPY), SizeReg).addReg(InputReg);
   }
 
-  unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
-  unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
-  CI.addReg(AX, RegState::Implicit)
-      .addReg(SP, RegState::Implicit)
-      .addReg(AX, RegState::Define | RegState::Implicit)
-      .addReg(SP, RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+  BuildMI(&MBB, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), CountReg).addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackReg).addReg(SPReg);
+
+  if (!InProlog) {
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), PCountReg)
+        .addReg(CountReg)
+        .addMBB(&MBB)
+        .addReg(LCountReg)
+        .addMBB(LoopMBB);
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), PStackReg)
+        .addReg(StackReg)
+        .addMBB(&MBB)
+        .addReg(LStackReg)
+        .addMBB(LoopMBB);
+  }
 
-  if (Is64Bit) {
-    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
-    // themselves. It also does not clobber %rax so we can reuse it when
-    // adjusting %rsp.
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
-        .addReg(X86::RSP)
-        .addReg(X86::RAX);
+  BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8))
+    .addReg(PStackReg)
+    .addImm(1)
+    .addReg(0)
+    .addImm(0)
+    .addReg(0)
+    .addImm(0);
+
+  BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+          LStackReg)
+      .addReg(PStackReg)
+      .addImm(GuardSize);
+
+  BuildMI(LoopMBB, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+          LCountReg)
+      .addReg(PCountReg)
+      .addImm(GuardSize);
+
+  BuildMI(LoopMBB, DL, TII.get(X86::JAE_1)).addMBB(LoopMBB);
+
+  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+  // Now that the probing is done, add code to ContinueMBB to update
+  // the stack pointer for real.
+  BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(Is64Bit ? X86::SUB64rr : X86::SUB32rr), SPReg)
+      .addReg(SPReg)
+      .addReg(SizeReg);
+
+  // Add the control flow edges we need.
+  MBB.addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(ContinueMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+
+  // Mark all the instructions added to the prolog as frame setup.
+  if (InProlog) {
+    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator MI = LoopMBB->begin();
+         MI != LoopMBB->end(); ++MI) {
+      MI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+         CMBBI != ContinueMBBI; ++CMBBI) {
+      CMBBI->setFlag(MachineInstr::FrameSetup);
+    }
   }
+
+  return ContinueMBBI;
 }
 
 static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -608,10 +747,11 @@
 */
 
 void X86FrameLowering::emitPrologue(MachineFunction &MF,
-                                    MachineBasicBlock &MBB) const {
+                                    MachineBasicBlock &InMBB) const {
   assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
          "MF used frame lowering for wrong subtarget");
-  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineBasicBlock *MBB = &InMBB;
+  MachineBasicBlock::iterator MBBI = MBB->begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
   MachineModuleInfo &MMI = MF.getMMI();
@@ -641,7 +781,9 @@
     X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
-  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+  bool UseRedZone = false;
+  bool UseStackProbe =
+      (STI.isOSWindows() && !STI.isTargetMachO()) || MF.shouldProbeStack();
 
   // The default stack probe size is 4096 if the function has no stackprobesize
   // attribute.
@@ -661,19 +803,26 @@
       !MFI->hasVarSizedObjects() && // No dynamic alloca.
       !MFI->adjustsStack() &&       // No calls.
       !IsWin64CC &&                 // Win64 has no Red Zone
+
+      !(UseStackProbe && StackSize > 128) && // Only use the Red Zone if we can
+                                             // fit the whole stack in it
+                                             // and thus stack probes won't be
+                                             // needed
+
       !usesTheStack(MF) &&          // Don't push and pop.
       !MF.shouldSplitStack()) {     // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
     MFI->setStackSize(StackSize);
+    UseRedZone = true;
   }
 
   // Insert stack pointer adjustment for later moving of return addr.  Only
   // applies to tail call optimized functions where the callee argument stack
   // size is bigger than the callers.
   if (TailCallReturnAddrDelta < 0) {
-    BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+    BuildStackAdjustment(*MBB, MBBI, DL, TailCallReturnAddrDelta,
                          /*InEpilogue=*/false)
         .setMIFlag(MachineInstr::FrameSetup);
   }
@@ -714,7 +863,7 @@
     MFI->setOffsetAdjustment(-NumBytes);
 
     // Save EBP/RBP into the appropriate stack slot.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+    BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
       .addReg(MachineFramePtr, RegState::Kill)
       .setMIFlag(MachineInstr::FrameSetup);
 
@@ -722,24 +871,24 @@
       // Mark the place where EBP/RBP was saved.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
-      BuildCFI(MBB, MBBI, DL,
+      BuildCFI(*MBB, MBBI, DL,
                MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
 
       // Change the rule for the FramePtr to be an "offset" rule.
       unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
-      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
+      BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createOffset(
                                   nullptr, DwarfFramePtr, 2 * stackGrowth));
     }
 
     if (NeedsWinCFI) {
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
           .addImm(FramePtr)
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
     if (!IsWin64Prologue) {
       // Update EBP with the new base value.
-      BuildMI(MBB, MBBI, DL,
+      BuildMI(*MBB, MBBI, DL,
               TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
               FramePtr)
           .addReg(StackPtr)
@@ -750,7 +899,7 @@
       // Mark effective beginning of when frame pointer becomes valid.
       // Define the current CFA to use the EBP/RBP register.
       unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
-      BuildCFI(MBB, MBBI, DL,
+      BuildCFI(*MBB, MBBI, DL,
                MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
     }
 
@@ -765,7 +914,7 @@
   bool PushedRegs = false;
   int StackOffset = 2 * stackGrowth;
 
-  while (MBBI != MBB.end() &&
+  while (MBBI != MBB->end() &&
          MBBI->getFlag(MachineInstr::FrameSetup) &&
          (MBBI->getOpcode() == X86::PUSH32r ||
           MBBI->getOpcode() == X86::PUSH64r)) {
@@ -777,13 +926,13 @@
       // Mark callee-saved push instruction.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
-      BuildCFI(MBB, MBBI, DL,
+      BuildCFI(*MBB, MBBI, DL,
                MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
       StackOffset += stackGrowth;
     }
 
     if (NeedsWinCFI) {
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
           MachineInstr::FrameSetup);
     }
   }
@@ -793,13 +942,13 @@
   // Don't do this for Win64, it needs to realign the stack after the prologue.
   if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
-    BuildStackAlignAND(MBB, MBBI, DL, MaxAlign);
+    BuildStackAlignAND(*MBB, MBBI, DL, MaxAlign);
   }
 
   // If there is an SUB32ri of ESP immediately before this instruction, merge
   // the two. This can be the case when tail call elimination is enabled and
   // the callee has more arguments then the caller.
-  NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+  NumBytes -= mergeSPUpdates(*MBB, MBBI, true);
 
   // Adjust stack pointer: ESP -= numbytes.
 
@@ -815,69 +964,86 @@
   if (IsWin64Prologue && TRI->needsStackRealignment(MF))
     AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign);
   if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
-    // Check whether EAX is livein for this function.
-    bool isEAXAlive = isEAXLiveIn(MF);
+    assert(!UseRedZone && "The Red Zone is not accounted for in stack probes");
 
-    if (isEAXAlive) {
-      // Sanity check that EAX is not livein for this function.
-      // It should not be, so throw an assert.
-      assert(!Is64Bit && "EAX is livein in x64 case!");
+    uint64_t PageSize = 0x1000;
 
-      // Save EAX
-      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
-        .addReg(X86::EAX, RegState::Kill)
-        .setMIFlag(MachineInstr::FrameSetup);
-    }
+    // If we only need to probe 5 pages or fewer, emit the probes inline
+    // instead of emitting a probe loop; this is exactly what the loop
+    // would do. Five probes is the threshold GCC uses before switching
+    // to a loop.
+    if (NumBytes <= 5 * PageSize) {
+      for (uint64_t i = 0; i < NumBytes / PageSize; ++i) {
+        BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8))
+          .addReg(StackPtr)
+          .addImm(1)
+          .addReg(0)
+          .addImm(- (i + 1) * PageSize)
+          .addReg(0)
+          .addImm(0)
+          .setMIFlag(MachineInstr::FrameSetup);
+      }
 
-    if (Is64Bit) {
-      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
-      // Function prologue is responsible for adjusting the stack pointer.
-      if (isUInt<32>(NumBytes)) {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-            .addImm(NumBytes)
-            .setMIFlag(MachineInstr::FrameSetup);
-      } else if (isInt<32>(NumBytes)) {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
-            .addImm(NumBytes)
-            .setMIFlag(MachineInstr::FrameSetup);
+      BuildMI(*MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+              StackPtr)
+         .addReg(StackPtr)
+         .addImm(NumBytes)
+         .setMIFlag(MachineInstr::FrameSetup);
+
+    } else {
+      // We spill the registers we need for the stack probe loop.
+
+      bool RAXAlive, RCXAlive, RDXAlive;
+
+      // TODO: Push the registers if they are callee-saved.
+
+      pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes);
+      pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RCXAlive, X86::RCX, NumBytes);
+      pushRegForStackProbeCall(MF, *MBB, MBBI, DL, RDXAlive, X86::RDX, NumBytes);
+
+      if (Is64Bit) {
+        // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+        // Function prologue is responsible for adjusting the stack pointer.
+        if (isUInt<32>(NumBytes)) {
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+              .addImm(NumBytes)
+              .setMIFlag(MachineInstr::FrameSetup);
+        } else if (isInt<32>(NumBytes)) {
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+              .addImm(NumBytes)
+              .setMIFlag(MachineInstr::FrameSetup);
+        } else {
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+              .addImm(NumBytes)
+              .setMIFlag(MachineInstr::FrameSetup);
+        }
       } else {
-        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+        BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
             .addImm(NumBytes)
             .setMIFlag(MachineInstr::FrameSetup);
       }
-    } else {
-      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
-      // We'll also use 4 already allocated bytes for EAX.
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
-        .setMIFlag(MachineInstr::FrameSetup);
-    }
 
-    // Save a pointer to the MI where we set AX.
-    MachineBasicBlock::iterator SetRAX = MBBI;
-    --SetRAX;
+      // Save a pointer to the MI where we set AX.
+      MachineBasicBlock::iterator SetRAX = MBBI;
+      --SetRAX;
 
-    // Call __chkstk, __chkstk_ms, or __alloca.
-    emitStackProbeCall(MF, MBB, MBBI, DL);
+      // Emit the stack probes.
+      MachineInstr *NextInstr = emitStackProbes(MF, *MBB, MBBI, DL, true);
+      MBBI = NextInstr;
+      MBB = NextInstr->getParent();
 
-    // Apply the frame setup flag to all inserted instrs.
-    for (; SetRAX != MBBI; ++SetRAX)
-      SetRAX->setFlag(MachineInstr::FrameSetup);
+      // Now restore the registers spilled above from their stack slots.
 
-    if (isEAXAlive) {
-      // Restore EAX
-      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
-                                              X86::EAX),
-                                      StackPtr, false, NumBytes - 4);
-      MI->setFlag(MachineInstr::FrameSetup);
-      MBB.insert(MBBI, MI);
+      popRegForStackProbeCall(MF, *MBB, MBBI, DL, RDXAlive, X86::RDX, NumBytes);
+      popRegForStackProbeCall(MF, *MBB, MBBI, DL, RCXAlive, X86::RCX, NumBytes);
+      popRegForStackProbeCall(MF, *MBB, MBBI, DL, RAXAlive, X86::RAX, NumBytes);
     }
   } else if (NumBytes) {
-    emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+    emitSPUpdate(*MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
   }
 
   if (NeedsWinCFI && NumBytes)
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+    BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
         .addImm(NumBytes)
         .setMIFlag(MachineInstr::FrameSetup);
 
@@ -885,19 +1051,19 @@
   if (IsWin64Prologue && HasFP) {
     SEHFrameOffset = calculateSetFPREG(NumBytes);
     if (SEHFrameOffset)
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+      addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
                    StackPtr, false, SEHFrameOffset);
     else
-      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr);
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr);
 
     if (NeedsWinCFI)
-      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+      BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
           .addImm(FramePtr)
           .addImm(SEHFrameOffset)
           .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+  while (MBBI != MBB->end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
     const MachineInstr *FrameInstr = &*MBBI;
     ++MBBI;
 
@@ -909,7 +1075,7 @@
           int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
           Offset += SEHFrameOffset;
 
-          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+          BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
               .addImm(Reg)
               .addImm(Offset)
               .setMIFlag(MachineInstr::FrameSetup);
@@ -919,7 +1085,7 @@
   }
 
   if (NeedsWinCFI)
-    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+    BuildMI(*MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
         .setMIFlag(MachineInstr::FrameSetup);
 
   // Realign stack after we spilled callee-saved registers (so that we'll be
@@ -927,7 +1093,7 @@
   // Win64 requires aligning the stack after the prologue.
   if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
-    BuildStackAlignAND(MBB, MBBI, DL, MaxAlign);
+    BuildStackAlignAND(*MBB, MBBI, DL, MaxAlign);
   }
 
   // If we need a base pointer, set it up here. It's whatever the value
@@ -937,14 +1103,14 @@
   if (TRI->hasBasePointer(MF)) {
     // Update the base pointer with the current stack pointer.
     unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
-    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+    BuildMI(*MBB, MBBI, DL, TII.get(Opc), BasePtr)
       .addReg(StackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
     if (X86FI->getRestoreBasePointer()) {
       // Stash value of base pointer.  Saving RSP instead of EBP shortens
       // dependence chain. Used by SjLj EH.
       unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+      addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(Opm)),
                    FramePtr, true, X86FI->getRestoreBasePointerOffset())
         .addReg(StackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
@@ -957,7 +1123,7 @@
       // other way around.
       unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
       unsigned IgnoredFrameReg;
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true,
+      addRegOffset(BuildMI(*MBB, MBBI, DL, TII.get(Opm)), BasePtr, true,
                    getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(),
                                           IgnoredFrameReg))
           .addReg(FramePtr)
@@ -970,13 +1136,13 @@
     if (!HasFP && NumBytes) {
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
-      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+      BuildCFI(*MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
                                   nullptr, -StackSize + stackGrowth));
     }
 
     // Emit DWARF info specifying the offsets of the callee-saved registers.
     if (PushedRegs)
-      emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+      emitCalleeSavedFrameMoves(*MBB, MBBI, DL);
   }
 }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -15047,7 +15047,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
-               SplitStack;
+               SplitStack || MF.shouldProbeStack();
   SDLoc dl(Op);
 
   if (!Lower) {
@@ -15125,6 +15125,7 @@
 
     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
     Flag = Chain.getValue(1);
+
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
@@ -20622,14 +20623,12 @@
 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
   DebugLoc DL = MI->getDebugLoc();
-
-  assert(!Subtarget->isTargetMachO());
-
-  Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
-                                                    DL);
-
+  MachineInstr *ResumeMI =
+    Subtarget->getFrameLowering()->emitStackProbes(*BB->getParent(), *BB, MI,
+                                                    DL, false);
+  MachineBasicBlock *ResumeBB = ResumeMI->getParent();
   MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
+  return ResumeBB;
 }
 
 MachineBasicBlock *
Index: test/CodeGen/X86/dynamic-alloca-in-entry.ll
===================================================================
--- test/CodeGen/X86/dynamic-alloca-in-entry.ll
+++ test/CodeGen/X86/dynamic-alloca-in-entry.ll
@@ -6,7 +6,7 @@
   ret void
 }
 ; CHECK-LABEL: _foo:
-; CHECK: calll __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 ; CHECK: retl
 
 ; Use of inalloca implies that that the alloca is not static.
@@ -15,5 +15,5 @@
   ret void
 }
 ; CHECK-LABEL: _bar:
-; CHECK: calll __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 ; CHECK: retl
Index: test/CodeGen/X86/inalloca-ctor.ll
===================================================================
--- test/CodeGen/X86/inalloca-ctor.ll
+++ test/CodeGen/X86/inalloca-ctor.ll
@@ -13,7 +13,7 @@
   %args = alloca inalloca %frame
   %c = getelementptr %frame, %frame* %args, i32 0, i32 2
 ; CHECK: movl    $20, %eax
-; CHECK: calll   __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 ; CHECK: movl %esp,
   call void @Foo_ctor(%Foo* %c)
 ; CHECK: leal 12(%{{.*}}),
Index: test/CodeGen/X86/inalloca-invoke.ll
===================================================================
--- test/CodeGen/X86/inalloca-invoke.ll
+++ test/CodeGen/X86/inalloca-invoke.ll
@@ -21,7 +21,7 @@
   %beg = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 0
   %end = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 1
 
-; CHECK:  calll   __chkstk
+; CHECK:  or{{.}}     $0, {{.*}}
 ; CHECK:  movl %esp, %[[beg:[^ ]*]]
 ; CHECK:  leal 12(%[[beg]]), %[[end:[^ ]*]]
 
Index: test/CodeGen/X86/inalloca-stdcall.ll
===================================================================
--- test/CodeGen/X86/inalloca-stdcall.ll
+++ test/CodeGen/X86/inalloca-stdcall.ll
@@ -9,7 +9,7 @@
 ; CHECK-LABEL: _g:
   %b = alloca inalloca %Foo
 ; CHECK: movl    $8, %eax
-; CHECK: calll   __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
Index: test/CodeGen/X86/inalloca.ll
===================================================================
--- test/CodeGen/X86/inalloca.ll
+++ test/CodeGen/X86/inalloca.ll
@@ -9,7 +9,7 @@
 entry:
   %b = alloca inalloca %Foo
 ; CHECK: movl    $8, %eax
-; CHECK: calll   __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
@@ -28,7 +28,7 @@
 entry:
   %b = alloca inalloca %Foo
 ; CHECK: movl    $8, %eax
-; CHECK: calll   __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
@@ -48,7 +48,7 @@
 entry:
   %b = alloca inalloca %Foo
 ; CHECK: movl    $8, %eax
-; CHECK: calll   __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
   %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
   %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
   store i32 13, i32* %f1
Index: test/CodeGen/X86/mem-intrin-base-reg.ll
===================================================================
--- test/CodeGen/X86/mem-intrin-base-reg.ll
+++ test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -65,7 +65,7 @@
 ; CHECK: movl %esp, %esi
 ; CHECK: pushl $128
 ; CHECK: calll _memcpy
-; CHECK: calll __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 
 ; stosd doesn't clobber esi, so we can use it.
 
Index: test/CodeGen/X86/mingw-alloca.ll
===================================================================
--- test/CodeGen/X86/mingw-alloca.ll
+++ test/CodeGen/X86/mingw-alloca.ll
@@ -6,9 +6,9 @@
 define void @foo1(i32 %N) nounwind {
 entry:
 ; COFF: _foo1:
-; COFF: calll __alloca
+; COFF: or{{.}}     $0, {{.*}}
 ; ELF: foo1:
-; ELF: calll _alloca
+; ELF: or{{.}}     $0, {{.*}}
 	%tmp14 = alloca i32, i32 %N		; <i32*> [#uses=1]
 	call void @bar1( i32* %tmp14 )
 	ret void
@@ -20,14 +20,10 @@
 entry:
 ; COFF: _foo2:
 ; COFF: andl $-16, %esp
-; COFF: pushl %eax
-; COFF: calll __alloca
-; COFF: movl	8028(%esp), %eax
+; COFF: or{{.}}     $0, {{.*}}
 ; ELF: foo2:
 ; ELF: andl $-16, %esp
-; ELF: pushl %eax
-; ELF: calll _alloca
-; ELF: movl	8028(%esp), %eax
+; ELF: or{{.}}     $0, {{.*}}
 	%A2 = alloca [2000 x i32], align 16		; <[2000 x i32]*> [#uses=1]
 	%A2.sub = getelementptr [2000 x i32], [2000 x i32]* %A2, i32 0, i32 0		; <i32*> [#uses=1]
 	call void @bar2( i32* %A2.sub, i32 %N )
Index: test/CodeGen/X86/movtopush.ll
===================================================================
--- test/CodeGen/X86/movtopush.ll
+++ test/CodeGen/X86/movtopush.ll
@@ -67,7 +67,6 @@
 
 ; If we have a reserved frame, we should have pushes
 ; NORMAL-LABEL: test2:
-; NORMAL-NOT: subl {{.*}} %esp
 ; NORMAL: pushl   $4
 ; NORMAL-NEXT: pushl   $3
 ; NORMAL-NEXT: pushl   $2
Index: test/CodeGen/X86/pr17631.ll
===================================================================
--- test/CodeGen/X86/pr17631.ll
+++ test/CodeGen/X86/pr17631.ll
@@ -18,7 +18,7 @@
 
 ; CHECK: equal
 ; CHECK-NOT: vzeroupper
-; CHECK: _chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 ; CHECK: ret
 
 define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
Index: test/CodeGen/X86/stack-probe-size.ll
===================================================================
--- test/CodeGen/X86/stack-probe-size.ll
+++ test/CodeGen/X86/stack-probe-size.ll
@@ -11,17 +11,6 @@
 
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 
-define i32 @test1() "stack-probe-size"="0" {
-  %buffer = alloca [4095 x i8]
-
-  ret i32 0
-
-; CHECK-LABEL: _test1:
-; CHECK-NOT: subl $4095, %esp
-; CHECK: movl $4095, %eax
-; CHECK: calll __chkstk
-}
-
 define i32 @test2() {
   %buffer = alloca [4095 x i8]
 
@@ -30,7 +19,7 @@
 ; CHECK-LABEL: _test2:
 ; CHECK-NOT: movl $4095, %eax
 ; CHECK: subl $4095, %esp
-; CHECK-NOT: calll __chkstk
+; CHECK-NOT: or{{.}}     $0, {{.*}}
 }
 
 define i32 @test3() "stack-probe-size"="8192" {
@@ -41,7 +30,7 @@
 ; CHECK-LABEL: _test3:
 ; CHECK-NOT: movl $4095, %eax
 ; CHECK: subl $4095, %esp
-; CHECK-NOT: calll __chkstk
+; CHECK-NOT: or{{.}}     $0, {{.*}}
 }
 
 define i32 @test4() "stack-probe-size"="0" {
@@ -51,8 +40,7 @@
 
 ; CHECK-LABEL: _test4:
 ; CHECK-NOT: subl $4096, %esp
-; CHECK: movl $4096, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 }
 
 define i32 @test5() {
@@ -62,8 +50,7 @@
 
 ; CHECK-LABEL: _test5:
 ; CHECK-NOT: subl $4096, %esp
-; CHECK: movl $4096, %eax
-; CHECK: calll __chkstk
+; CHECK: or{{.}}     $0, {{.*}}
 }
 
 define i32 @test6() "stack-probe-size"="8192" {
@@ -74,5 +61,5 @@
 ; CGECK-LABEL: _test6:
 ; CGECK-NOT: movl $4096, %eax
 ; CGECK: subl $4096, %esp
-; CGECK-NOT: calll __chkstk
+; CHECK-NOT: or{{.}}     $0, {{.*}}
 }
Index: test/CodeGen/X86/stack-probes.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/stack-probes.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X86-LINUX %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X64-LINUX %s
+
+declare void @use([40096 x i8]*)
+
+; Ensure calls to __probestack occur for large stack frames
+define void @test() "probe-stack" {
+  %array = alloca [40096 x i8]
+  call void @use([40096 x i8]* %array)
+  ret void
+
+; X86-LINUX-LABEL: test:
+; X86-LINUX:       movl    $40108, %eax            # imm = 0x9CAC
+; X86-LINUX-NEXT:  movl    %eax, %edx
+; X86-LINUX-NEXT:  movl    %esp, %ecx
+; X86-LINUX-LABEL: .LBB0_1
+; X86-LINUX-NEXT:  orl     $0, (%ecx)
+; X86-LINUX-NEXT:  subl    $4096, %ecx
+; X86-LINUX-NEXT:  subl    $4096, %edx
+; X86-LINUX-NEXT:  jae     .LBB0_1
+; X86-LINUX:       subl    %eax, %esp
+; X86-LINUX:       addl    $40108, %esp            # imm = 0x9CAC
+
+; X64-LINUX-LABEL: test:
+; X64-LINUX:       movl    $40104, %eax            # imm = 0x9CA8
+; X64-LINUX-NEXT:  movq    %rax, %rdx
+; X64-LINUX-NEXT:  movq    %rsp, %rcx
+; X64-LINUX-LABEL: .LBB0_1
+; X64-LINUX-NEXT:  orq     $0, (%rcx)
+; X64-LINUX-NEXT:  subq    $4096, %rcx
+; X64-LINUX-NEXT:  subq    $4096, %rdx
+; X64-LINUX-NEXT:  jae     .LBB0_1
+; X64-LINUX:       subq    %rax, %rsp
+; X64-LINUX:       addq    $40104, %rsp            # imm = 0x9CA8
+
+}
+
+declare void @useFast([4096 x i8]*)
+
+; Ensure the stack is probed for medium stack frames
+define void @testFast() "probe-stack" {
+  %array = alloca [4096 x i8]
+  call void @useFast([4096 x i8]* %array)
+  ret void
+
+; X86-LINUX-LABEL: testFast:
+; X86-LINUX:       orl     $0, -4096(%esp)
+; X86-LINUX-NEXT:  subl    $4108, %esp             # imm = 0x100C
+
+; X64-LINUX-LABEL: testFast:
+; X64-LINUX:       orq     $0, -4096(%rsp)
+; X64-LINUX-NEXT:  subq    $4104, %rsp             # imm = 0x1008
+
+}
Index: test/CodeGen/X86/win64_alloca_dynalloca.ll
===================================================================
--- test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,6 +1,5 @@
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32 -code-model=large | FileCheck %s -check-prefix=L64
 ; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
@@ -14,23 +13,15 @@
   %buf0 = alloca i8, i64 4096, align 1
 
 ; ___chkstk_ms does not adjust %rsp.
-; M64:       $4096, %eax
-; M64: callq ___chkstk_ms
-; M64: subq  %rax, %rsp
+; M64: or{{.}}     $0, {{.*}}
+; M64: subq  {{.*}}, %rsp
 ; M64: leaq 128(%rsp), %rbp
 
 ; __chkstk does not adjust %rsp.
-; W64:       $4096, %eax
-; W64: callq __chkstk
-; W64: subq  %rax, %rsp
+; W64: or{{.}}     $0, {{.*}}
+; W64: subq  {{.*}}, %rsp
 ; W64: leaq 128(%rsp), %rbp
 
-; Use %r11 for the large model.
-; L64:       $4096, %eax
-; L64: movabsq $__chkstk, %r11
-; L64: callq *%r11
-; L64: subq  %rax, %rsp
-
 ; Freestanding
 ; EFI:       $[[B0OFS:4096|4104]], %rsp
 ; EFI-NOT:   call
@@ -39,23 +30,16 @@
 
 ; M64: leaq  15(%{{.*}}), %rax
 ; M64: andq  $-16, %rax
-; M64: callq ___chkstk_ms
-; M64: subq  %rax, %rsp
+; M64: or{{.}}     $0, {{.*}}
+; M64: subq  {{.*}}, %rsp
 ; M64: movq  %rsp, %rax
 
 ; W64: leaq  15(%{{.*}}), %rax
 ; W64: andq  $-16, %rax
-; W64: callq __chkstk
-; W64: subq  %rax, %rsp
+; W64: or{{.}}     $0, {{.*}}
+; W64: subq  {{.*}}, %rsp
 ; W64: movq  %rsp, %rax
 
-; L64: leaq  15(%{{.*}}), %rax
-; L64: andq  $-16, %rax
-; L64: movabsq $__chkstk, %r11
-; L64: callq *%r11
-; L64: subq  %rax, %rsp
-; L64: movq  %rsp, %rax
-
 ; EFI: leaq  15(%{{.*}}), [[R1:%r.*]]
 ; EFI: andq  $-16, [[R1]]
 ; EFI: movq  %rsp, [[R64:%r.*]]
@@ -97,16 +81,16 @@
 
 ; M64: leaq  15(%{{.*}}), %rax
 ; M64: andq  $-16, %rax
-; M64: callq ___chkstk_ms
-; M64: subq  %rax, %rsp
+; M64: or{{.}}     $0, {{.*}}
+; M64: subq  {{.*}}, %rsp
 ; M64: movq  %rsp, [[R2:%r.*]]
 ; M64: andq  $-128, [[R2]]
 ; M64: movq  [[R2]], %rsp
 
 ; W64: leaq  15(%{{.*}}), %rax
 ; W64: andq  $-16, %rax
-; W64: callq __chkstk
-; W64: subq  %rax, %rsp
+; W64: or{{.}}     $0, {{.*}}
+; W64: subq  {{.*}}, %rsp
 ; W64: movq  %rsp, [[R2:%r.*]]
 ; W64: andq  $-128, [[R2]]
 ; W64: movq  [[R2]], %rsp
Index: test/CodeGen/X86/win64_eh.ll
===================================================================
--- test/CodeGen/X86/win64_eh.ll
+++ test/CodeGen/X86/win64_eh.ll
@@ -37,9 +37,7 @@
 }
 ; WIN64-LABEL: foo2:
 ; WIN64: .seh_proc foo2
-; WIN64: movl $8000, %eax
-; WIN64: callq {{__chkstk|___chkstk_ms}}
-; WIN64: subq %rax, %rsp
+; WIN64: or{{.}}     $0, {{.*}}
 ; WIN64: .seh_stackalloc 8000
 ; WIN64: .seh_endprologue
 ; WIN64: addq $8000, %rsp
Index: test/CodeGen/X86/win64_frame.ll
===================================================================
--- test/CodeGen/X86/win64_frame.ll
+++ test/CodeGen/X86/win64_frame.ll
@@ -103,7 +103,7 @@
   ; CHECK:        leaq    15(,%rax,4), %rcx
   ; CHECK:        movabsq $34359738352, %rax
   ; CHECK:        andq    %rcx, %rax
-  ; CHECK:        callq   __chkstk
+  ; CHECK:        or{{.}}     $0, {{.*}}
   ; CHECK:        subq    %rax, %rsp
 
   %gep = getelementptr [300 x i8], [300 x i8]* %alloca, i32 0, i32 0
Index: test/CodeGen/X86/win_chkstk.ll
===================================================================
--- test/CodeGen/X86/win_chkstk.ll
+++ test/CodeGen/X86/win_chkstk.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN_X32
-; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN_X64
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -code-model=large | FileCheck %s -check-prefix=WIN64_LARGE
-; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X32
-; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X64
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN
+; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN
 ; RUN: llc < %s -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
 ; RUN: llc < %s -mtriple=x86_64-pc-win32-macho | FileCheck %s -check-prefix=LINUX
 
@@ -15,13 +14,8 @@
 ; Stack allocation >= 4096 bytes will require call to __chkstk in the Windows ABI.
 define i32 @main4k() nounwind {
 entry:
-; WIN_X32:    calll __chkstk
-; WIN_X64:    callq __chkstk
-; WIN64_LARGE: movabsq $__chkstk, %r11
-; WIN64_LARGE: callq *%r11
-; MINGW_X32:  calll __alloca
-; MINGW_X64:  callq ___chkstk_ms
-; LINUX-NOT:  call __chkstk
+; WIN: or{{.}}     $0, {{.*}}
+; LINUX-NOT: or{{[ql]}}     $0, {{.*}}
   %array4096 = alloca [4096 x i8], align 16       ; <[4096 x i8]*> [#uses=0]
   ret i32 0
 }
@@ -30,21 +24,8 @@
 ; allocation.
 define i32 @main128() nounwind {
 entry:
-; WIN_X32:       # BB#0:
-; WIN_X32-NOT:   calll __chkstk
-; WIN_X32:       ret
-
-; WIN_X64:       # BB#0:
-; WIN_X64-NOT:   callq __chkstk
-; WIN_X64:       ret
-
-; MINGW_X64:     # BB#0:
-; MINGW_X64-NOT: callq ___chkstk_ms
-; MINGW_X64:     ret
-
-; LINUX:         # BB#0:
-; LINUX-NOT:     call __chkstk
-; LINUX:         ret
+; WIN-NOT: or{{.}}     $0, {{.*}}
+; LINUX-NOT: or{{.}}     $0, {{.*}}
   %array128 = alloca [128 x i8], align 16         ; <[128 x i8]*> [#uses=0]
   ret i32 0
 }
@@ -53,13 +34,8 @@
 ; caller has the Win64 calling convention.
 define x86_64_win64cc i32 @main4k_win64() nounwind {
 entry:
-; WIN_X32:    calll __chkstk
-; WIN_X64:    callq __chkstk
-; WIN64_LARGE: movabsq $__chkstk, %r11
-; WIN64_LARGE: callq *%r11
-; MINGW_X32:  calll __alloca
-; MINGW_X64:  callq ___chkstk_ms
-; LINUX-NOT:  call __chkstk
+; WIN: or{{.}}     $0, {{.*}}
+; LINUX-NOT: or{{.}}     $0, {{.*}}
   %array4096 = alloca [4096 x i8], align 16       ; <[4096 x i8]*> [#uses=0]
   ret i32 0
 }
Index: test/CodeGen/X86/windows-itanium-alloca.ll
===================================================================
--- test/CodeGen/X86/windows-itanium-alloca.ll
+++ test/CodeGen/X86/windows-itanium-alloca.ll
@@ -12,5 +12,4 @@
   ret void
 }
 
-; CHECK: __chkstk
-
+; CHECK: orl     $0, (%{{.*}})