Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -17,6 +17,7 @@
   X86CallFrameOptimization.cpp
   X86ExpandPseudo.cpp
   X86FastISel.cpp
+  X86FixupSeparateStack.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp
   X86ISelDAGToDAG.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -78,6 +78,10 @@
 /// in order to eliminate partial register usage, false dependences on
 /// the upper portions of registers, and to save code size.
 FunctionPass *createX86FixupBWInsts();
+
+/// Return a pass that helps to support separate stack and data segments by
+/// emitting segment override prefixes where necessary.
+FunctionPass *createX86FixupSeparateStack();
 } // End llvm namespace
 
 #endif
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -247,6 +247,10 @@
 def FeatureFastPartialYMMWrite
     : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
                        "true", "Partial writes to YMM registers are fast">;
+def FeatureSeparateStackSeg
+    : SubtargetFeature<"separate-stack-seg", "UseSeparateStackSeg", "true",
+                       "Support separate stack and data segments by emitting"
+                       " segment override prefixes (X86-32 only)">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
Index: lib/Target/X86/X86FixupSeparateStack.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86FixupSeparateStack.cpp
@@ -0,0 +1,1079 @@
+//===----- X86FixupSeparateStack.cpp - Fixup stack memory accesses -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that emits segment override prefixes to help
+// support separate stack and data segments for X86-32.
+//
+//===----------------------------------------------------------------------===//
+
+#include <map>
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-separate-stack"
+
+enum class SPStoreVerif { None, IgnoreVA, All };
+
+static cl::opt<SPStoreVerif> SPStoreVerification(
+    "sep-stk-seg-sp-store-verif",
+    cl::Hidden, cl::init(SPStoreVerif::All),
+    cl::desc("Verification for stores of pointers pointing into SS segment"),
+    cl::values(clEnumValN(SPStoreVerif::None, "none",
+                          "No verification"),
+               clEnumValN(SPStoreVerif::IgnoreVA, "ignore-va",
+                          "Verify only in functions that do not have a "
+                          "va_start"),
+               clEnumValN(SPStoreVerif::All, "all",
+                          "Verify in all functions"),
+               clEnumValEnd));
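+// For example (hypothetical invocation, assuming an i386 triple), the pass
+// and this verification level can be exercised with:
+//   llc -mtriple=i386-unknown-linux-gnu -mattr=+separate-stack-seg \
+//       -sep-stk-seg-sp-store-verif=ignore-va foo.ll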
+namespace {
+/// Records information about registers that directly or indirectly are used
+/// in memory operands within a basic block. It is used to distinguish
+/// registers that point to stack memory from those that do not. It only
+/// explicitly stores information about registers that point to the stack,
+/// which implies that all other physical registers do not point to the stack.
+class AddrRegs {
+private:
+  const X86RegisterInfo *TRI;
+
+public:
+  /// Registers that point to the stack
+  DenseSet<unsigned> Stack;
+
+  AddrRegs(const X86RegisterInfo *TRI_) : TRI(TRI_) {}
+
+  void insertStackReg(unsigned Reg);
+
+  /// \brief Return true iff Reg points to a stack location.
+  bool pointsIntoStack(unsigned Reg) const;
+};
+
+/// Requirements for how registers that are directly or indirectly used in
+/// memory operands are configured.
+///
+/// Each basic block is associated with one AddrRegReqs object, although that
+/// object may actually contain information about the successors of the basic
+/// block with which it is associated.
+class AddrRegReqs {
+private:
+  const X86RegisterInfo *TRI;
+
+  /// The collection of AddrRegReqs objects that is associated with a function
+  /// is structured as a directed graph that may contain cycles. This set of
+  /// Predecessors points to the AddrRegReqs objects corresponding to all of
+  /// the basic blocks that immediately precede the one associated with this
+  /// object and that have already been visited or are currently being visited
+  /// by this pass.
+  std::set<AddrRegReqs *> Predecessors;
+  /// The original status of the registers at the time this object branched
+  /// from its first predecessor, according to the traversal order of this
+  /// pass.
+  const AddrRegs OrigRegs;
+
+  /// Pairs of registers (A, B) in which A may derive a pointer to stack data
+  /// from B. This is the case if an instruction form that the compiler
+  /// selects to compute a pointer to stack data uses B to define A. See
+  /// X86FixupSeparateStack::mayComputeStackPtr for more details.
+  ///
+  /// This also indicates cancellations of implicit derivations. Every
+  /// physical register is implicitly derived from itself unless an entry in
+  /// Derivatives indicates otherwise. Specifically, it is as though
+  /// Derivatives contains (A, A) for all physical registers A unless there
+  /// exists an entry (A, X) in Derivatives for some physical register X or
+  /// the special value X86::NoRegister. This special value indicates that A
+  /// cannot possibly point to stack data.
+  ///
+  /// Note that the presence of an entry (A, B) in Derivatives does NOT
+  /// automatically indicate that A points to stack data. It only indicates
+  /// that if B pointed to stack data when this basic block was entered, then
+  /// A points to stack data at the current instruction. Also keep in mind
+  /// that Derivatives gets updated as additional instructions in the basic
+  /// block are processed.
+  ///
+  /// The entry (ESP, ESP) is always implicitly (and never explicitly) present
+  /// in Derivatives.
+  std::vector<std::pair<unsigned, unsigned>> Derivatives;
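+  // Illustrative derivation example (hypothetical instruction sequence):
+  //   movl %esp, %esi   -> derive(ESP, ESI) records (ESI, ESP)
+  //   addl $16, %esi    -> derive(ESI, ESI) keeps the root: (ESI, ESP)
+  //   movl $0, %esi     -> markNotDerived(ESI) records (ESI, NoRegister)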
+  /// Root registers that are used directly or indirectly in memory operands
+  DenseSet<unsigned> UsedInMemOp;
+
+  /// \brief Look up the register from which To may derive a pointer to stack
+  /// data, if such a register exists.
+  ///
+  /// \returns The register Root if an implicit or explicit entry (To, Root)
+  /// is currently present in Derivatives. Otherwise, returns X86::NoRegister.
+  /// Note that there may be an entry (To, X86::NoRegister) in Derivatives,
+  /// which would also result in X86::NoRegister being returned.
+  unsigned lookupRoot(unsigned To) const;
+
+public:
+  /// \brief Construct an object with no initial predecessor. One or more
+  /// predecessors may still be added later with addPredecessor.
+  AddrRegReqs(const X86RegisterInfo *TRI_) : TRI(TRI_), OrigRegs(TRI_) {}
+  /// \brief Construct an object with one initial predecessor. Initialize
+  /// OrigRegs to represent the register configuration at the end of
+  /// Predecessor, since this is only invoked after Predecessor has been
+  /// processed.
+  AddrRegReqs(AddrRegReqs *Predecessor);
+
+  /// \brief Extract the register configuration represented by the current
+  /// state of this object. This is based on the contents of OrigRegs, and it
+  /// is adjusted to account for the contents of Derivatives.
+  AddrRegs extractRegs();
+
+  /// \brief Register a new predecessor for this object. This involves the
+  /// following steps:
+  /// 1. Insert Predecessor into Predecessors so that any time useInMemOp is
+  ///    invoked on this object, it may also invoke Predecessor->useInMemOp
+  ///    as necessary.
+  /// 2. Invoke Predecessor->useInMemOp for each register in UsedInMemOp,
+  ///    since the basic block corresponding to this object uses those
+  ///    registers even though the instructions that are in the predecessor
+  ///    basic block may not have used those registers.
+  void addPredecessor(AddrRegReqs *Predecessor);
+
+  /// \brief Record that To is derived from From in Derivatives.
+  void derive(unsigned From, unsigned To);
+
+  /// \brief Record that To is redefined by some operation that cannot be used
+  /// to derive a pointer into the stack.
+  void markNotDerived(unsigned To);
+
+  /// \brief Record that Reg is used as a base address register in a memory
+  /// operand and check that the requirement for this to be a stack or data
+  /// pointer is met.
+  ///
+  /// This routine is invoked recursively for each predecessor of this object,
+  /// and there may even be cycles. However, the recursion will eventually
+  /// terminate. The routine is only invoked recursively if an entry is added
+  /// to this->UsedInMemOp. The only entries that can be added are those
+  /// computed from the contents of Derivatives and OrigRegs. The resultant
+  /// set is a subset of the set of physical registers, which is finite.
+  /// Entries are never removed from UsedInMemOp by this routine. Thus, this
+  /// routine will eventually run out of entries to be added to UsedInMemOp.
+  ///
+  /// \param Reg Register that is used as a base address register in a memory
+  ///            operand.
+  /// \param IsStackReg Set to true iff Reg is required to point to stack
+  ///                   memory. Otherwise, Reg is required to point to data
+  ///                   memory.
+  void useInMemOp(unsigned Reg, bool IsStackReg);
+
+  /// \brief Return true iff Reg points to a stack location at the current
+  /// instruction.
+  bool pointsIntoStack(unsigned Reg) const;
+};
+
+/// Requirements for how register spills of pointers to the stack segment are
+/// arranged in the frame.
+class StackPtrSpillReqs {
+private:
+  std::set<StackPtrSpillReqs *> Predecessors;
+
+  /// Frame slots that have been used in the associated basic block or one or
+  /// more of its successors to fill registers with stack pointers or other
+  /// data.
+  std::map<int64_t, bool> Demand;
+
+  /// Frame slots that have stack pointers or other data spilled into them by
+  /// instructions in the function associated with this object.
+  std::map<int64_t, bool> Supply;
+
+  /// True if this object is associated with the function entrypoint.
+  bool Top;
+
+  enum class DemandResponse { Unsatisfied, SatisfiedSP, SatisfiedNonSP };
+
+  /// \brief Check whether a spill slot currently contains a stack pointer or
+  /// other data.
+  DemandResponse demand(int64_t Start, uint64_t Size,
+                        std::set<StackPtrSpillReqs *> Demanders);
+
+public:
+  /// \brief Construct an object with no initial predecessor. One or more
+  /// predecessors may still be added later with addPredecessor.
+  StackPtrSpillReqs() : Top(true) {}
+  /// \brief Construct an object with one initial predecessor.
+  StackPtrSpillReqs(StackPtrSpillReqs *Predecessor);
+
+  /// \brief Register a new predecessor for this object. This involves the
+  /// following steps:
+  /// 1. Insert Predecessor into Predecessors so that any time demand is
+  ///    invoked on this object, it may also invoke Predecessor->demand as
+  ///    necessary.
+  /// 2. Invoke Predecessor->demand for each frame slot demanded by this set
+  ///    of requirements.
+  void addPredecessor(StackPtrSpillReqs *Predecessor);
+
+  /// \brief Record that either a stack pointer or other data is spilled into
+  /// the specified slot by an instruction in the function associated with
+  /// this object.
+  void supply(bool SP, int64_t Start, uint64_t Size);
+
+  /// \brief Check whether the specified slot most recently had stack pointer
+  /// data spilled into it.
+  bool isSPSpill(int64_t Start, uint64_t Size);
+};
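+// Illustrative example (hypothetical code, assuming EBP is the frame
+// register and is distinct from ESP):
+//   movl %esp, %eax       ; EAX becomes a stack pointer
+//   movl %eax, -8(%ebp)   ; supply(true, -8, 4): the slot holds a stack
+//                         ; pointer
+//   movl -8(%ebp), %ecx   ; isSPSpill(-8, 4) is true, so the fill marks
+//                         ; ECX as a stack pointer again
+// A fill that follows a non-pointer store to the same slot, e.g.
+// "movl $0, -8(%ebp)" (supply(false, -8, 4)), instead marks the filled
+// register as not pointing into the stack.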
+/// This pass performs a depth-first traversal of each function, visiting each
+/// basic block once.
+///
+/// To understand how this pass determines whether its transformation is
+/// correct, it may be helpful to think of each basic block as an electronic
+/// circuit with 12V and 5V inputs. Those inputs take the form of register
+/// values fed into basic blocks. Think of pointers to stack data as 12V
+/// inputs and other register values as 5V inputs.
+///
+/// Continuing this analogy, the problem that this pass seeks to solve is to
+/// determine for each circuit element whether it is always wired to 12V or
+/// 5V. This pass considers each instruction to be analogous to a circuit
+/// element. Of course, each basic block may be invoked after multiple
+/// predecessors. Think of this like a circuit that can have multiple copies
+/// of a particular circuit block comprising circuit elements such as
+/// resistors and capacitors, and all replicas of a particular element must
+/// use the same voltage inputs.
+///
+/// An assumption that helps to make the problem more tractable for this pass
+/// is that pointers to data on the stack are only computed using a couple of
+/// instruction forms. This is analogous to a specific material being used to
+/// carry 12V in a circuit. It is still possible that a 5V input signal could
+/// be connected to a 12V-capable trace, but a 12V input signal will never be
+/// connected to a 5V trace. It is possible to determine which inputs may be
+/// connected to 12V input signals just by looking at a particular circuit
+/// block, without needing to know how all replicas of it will be connected
+/// in the whole circuit. Similarly, it is possible to determine which
+/// register inputs to a particular instruction may point to stack data by
+/// analyzing just the instructions preceding that one in its basic block.
+///
+/// Ultimately, it is still necessary to analyze the control flow between
+/// basic blocks to make sure that for each base address register used in
+/// each memory operand, the register consistently refers to either stack
+/// data or non-stack data on every control flow path that contains the
+/// corresponding instruction. This is analogous to checking circuit blocks
+/// to make sure that for each 12V-capable trace leading to each circuit
+/// element, all replicas are connected to 12V input signals or all replicas
+/// are connected to 5V input signals. For efficiency, this pass only
+/// traverses each basic block once. The connectivity check is performed by
+/// recording the predecessors of each basic block and tracing a particular
+/// register value back through the chain of derivations that produced it to
+/// check the property described above. This is analogous to tracing a wire
+/// connected to a circuit element back to its source.
+///
+/// The SafeStack instrumentation pass moves to the unsafe stack any objects
+/// that could have pointers to them passed between functions. This is
+/// necessary, since the X86FixupSeparateStack pass only performs
+/// intra-procedural analysis. The runtime can place the unsafe stack outside
+/// of the stack segment, so that accesses to it do not require segment
+/// override prefixes.
+///
+/// Even with the SafeStack pass enabled, va_list objects containing pointers
+/// to the stack may still be passed between C/C++ functions. That is why the
+/// Clang frontend recognizes when this pass is enabled and adds the
+/// appropriate segment override prefixes to __builtin_va_arg invocations.
+///
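+/// For example (hypothetical x86-32 assembly; actual code depends on
+/// register allocation), the pass rewrites a load through a register that
+/// derives from ESP:
+///
+///   leal 8(%esp), %eax       ; EAX becomes a derived stack pointer
+///   movl (%eax), %ecx        ; rewritten to: movl %ss:(%eax), %ecx
+///
+/// A load through a register that never derives from ESP, such as
+/// movl (%ebx), %edx, keeps its implicit DS segment and is left unchanged.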
+class X86FixupSeparateStack : public MachineFunctionPass {
+public:
+  X86FixupSeparateStack() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  const char *getPassName() const override {
+    return "X86 Fixup Separate Stack";
+  }
+
+  const TargetInstrInfo *TII;
+  const X86RegisterInfo *TRI;
+  const X86Subtarget *STI;
+  unsigned FrameReg;
+  unsigned StackReg;
+  bool HasVAStart;
+  int VAStartFI;
+  static char ID;
+
+  /// Map from basic block to corresponding AddrRegReqs and StackPtrSpillReqs
+  /// objects
+  std::map<MachineBasicBlock *,
+           std::pair<std::shared_ptr<AddrRegReqs>,
+                     std::shared_ptr<StackPtrSpillReqs>>> Reqs;
+
+  /// \brief Perform fixups for a string instruction.
+  ///
+  /// \param I Iterator pointing to the string instruction in the basic block.
+  ///          This routine may add new instructions around the string
+  ///          instruction, in which case 'I' will point to the instruction
+  ///          that was added at the highest address.
+  /// \param StringDS true if the instruction accesses DS
+  /// \param StringES true if the instruction accesses ES
+  ///
+  /// \returns true if the basic block was changed
+  bool processStringInstr(MachineBasicBlock *BB,
+                          MachineBasicBlock::iterator &I, AddrRegReqs &AReqs,
+                          bool StringDS, bool StringES);
+
+  /// \brief Perform fixups for an instruction that accesses memory with an
+  /// explicit memory operand.
+  ///
+  /// \param MemoryOperand Offset to the memory operand within the instruction
+  ///
+  /// \returns true if the instruction was changed
+  bool processMemAccessInstr(MachineInstr &I, int MemoryOperand,
+                             AddrRegReqs &AReqs);
+
+  enum FrameOp {
+    NoFrameOp, // or an irrelevant frame operation
+    SpillReg,
+    FillRegWithStackPtr
+  };
+
+  /// \brief Check whether an instruction is a register spill or fill and, if
+  /// so, update the StackPtrSpillReqs object appropriately.
+  ///
+  /// \param PossibleSP Set to true if this instruction may possibly spill or
+  ///        fill a stack pointer. Some instructions spill and fill registers,
+  ///        but are assumed to not handle stack pointers (e.g. FP
+  ///        instructions).
+  ///
+  /// \returns type of frame operation performed by the instruction
+  FrameOp checkForRegisterSpill(MachineInstr &I, int MemoryOperand,
+                                bool PossibleSP, AddrRegReqs &AReqs,
+                                StackPtrSpillReqs &SReqs);
+
+  /// \brief Check whether an instruction is of a type that may be used to
+  /// compute a derived stack pointer.
+  ///
+  /// \returns X86::NoRegister if the instruction is not of a type that may be
+  /// used to compute a derived stack pointer. Otherwise, returns the register
+  /// that is used as the source of the possible derivation.
+  unsigned mayComputeStackPtr(const MachineInstr &I) const;
+
+  /// \brief Process a basic block.
+  ///
+  /// \param StackGrowth Current size of the stack at this invocation point.
+  ///        This is only tracked if this function has a va_start, so that it
+  ///        can be used to detect instructions that store the address of the
+  ///        variadic arguments to memory. This needs to be detected
+  ///        specially, since storing a pointer to stack memory is otherwise
+  ///        disallowed.
+  ///
+  /// \returns true if the basic block was changed
+  bool processBasicBlock(MachineBasicBlock *BB, int64_t StackGrowth,
+                         std::shared_ptr<AddrRegReqs> AReqs,
+                         std::shared_ptr<StackPtrSpillReqs> SReqs);
+};
+
+char X86FixupSeparateStack::ID = 0;
+}
+
+FunctionPass *llvm::createX86FixupSeparateStack() {
+  return new X86FixupSeparateStack();
+}
+
+void AddrRegs::insertStackReg(unsigned Reg) {
+  for (auto N : Stack)
+    if (TRI->regsOverlap(N, Reg))
+      return;
+
+  Stack.insert(Reg);
+}
+
+bool AddrRegs::pointsIntoStack(unsigned Reg) const {
+  if (!TRI->isPhysicalRegister(Reg))
+    return false;
+
+  if (TRI->regsOverlap(TRI->getStackRegister(), Reg))
+    // The stack register is assumed to always point to the stack
+    return true;
+
+  for (auto N : Stack)
+    if (TRI->regsOverlap(N, Reg))
+      return true;
+
+  return false;
+}
+
+AddrRegReqs::AddrRegReqs(AddrRegReqs *Predecessor)
+    : TRI(Predecessor->TRI), OrigRegs(Predecessor->extractRegs()) {
+  Predecessors.insert(Predecessor);
+}
+
+AddrRegs AddrRegReqs::extractRegs() {
+  AddrRegs Regs(TRI);
+
+  // Registers that have been redefined since the beginning of the basic block
+  DenseSet<unsigned> Redefined;
+
+  // Handle the explicit derivations:
+  for (auto &D : Derivatives) {
+    // Note that this derivation overrides the corresponding implicit one:
+    Redefined.insert(D.first);
+    if (OrigRegs.pointsIntoStack(D.second))
+      Regs.insertStackReg(D.first);
+  }
+
+  // Handle the implicit derivations:
+  for (auto N : OrigRegs.Stack)
+    if (Redefined.find(N) == Redefined.end())
+      Regs.insertStackReg(N);
+
+  return Regs;
+}
+
+void AddrRegReqs::addPredecessor(AddrRegReqs *Predecessor) {
+  // Keep the insertion outside of the assert so that it still happens in
+  // builds with assertions disabled.
+  bool Inserted = Predecessors.insert(Predecessor).second;
+  (void)Inserted;
+  assert(Inserted && "Predecessor was already registered");
+
+  // It is correct to copy the current value of UsedInMemOp and iterate
+  // through that in this method, even if this object is part of a cycle of
+  // predecessors and the useInMemOp routine adds a register to
+  // this->UsedInMemOp. In that case, the useInMemOp routine will itself
+  // invoke Predecessor->useInMemOp with the newly-added register, since
+  // Predecessor was added to Predecessors prior to invoking useInMemOp below.
+  DenseSet<unsigned> OrigUsedInMemOp(UsedInMemOp);
+
+  for (auto N : OrigUsedInMemOp)
+    Predecessor->useInMemOp(N, OrigRegs.pointsIntoStack(N));
+}
+
+unsigned AddrRegReqs::lookupRoot(unsigned To) const {
+  if (!TRI->isPhysicalRegister(To))
+    return X86::NoRegister;
+
+  for (auto &Derivative : Derivatives)
+    if (TRI->regsOverlap(Derivative.first, To))
+      return Derivative.second;
+
+  return To;
+}
+
+void AddrRegReqs::derive(unsigned From, unsigned To) {
+  if (TRI->regsOverlap(To, TRI->getStackRegister()))
+    // The stack register is always assumed to point to the stack, regardless
+    // of what instructions are used to update it.
+    return;
+
+  // Look up the root register from which From is derived, since the second
+  // identifier in each pair in Derivatives is actually for the register value
+  // at the beginning of the basic block from which the value for the register
+  // identified by the first number is derived.
+  unsigned RootFrom = lookupRoot(From);
+
+  for (auto &Derivative : Derivatives)
+    if (TRI->regsOverlap(Derivative.first, To)) {
+      // Update an existing entry
+      Derivative.second = RootFrom;
+      return;
+    }
+
+  // Replace the implicit derivation for To with the following explicit
+  // derivation.
+  Derivatives.push_back(std::make_pair(To, RootFrom));
+}
+
+void AddrRegReqs::markNotDerived(unsigned To) {
+  derive(X86::NoRegister, To);
+}
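+// Example of a pattern that useInMemOp rejects (hypothetical control flow):
+// if one predecessor of a block ends with "movl %esp, %eax" and another
+// with "movl $buf, %eax", a use of (%eax) as a base address in the join
+// block would see EAX pointing into the stack on one path and into DS data
+// on the other, which trips the assertion below.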
+void AddrRegReqs::useInMemOp(unsigned Reg, bool IsStackReg) {
+  bool FromStackReg = false;
+
+  unsigned RootFrom = lookupRoot(Reg);
+  if (RootFrom != X86::NoRegister)
+    FromStackReg = OrigRegs.pointsIntoStack(RootFrom);
+
+  // For registers that are set by instructions of the types used by the
+  // compiler to compute pointers to stack data, this pass checks that each
+  // such register either always points to stack data or never points to
+  // stack data at each instruction that uses the register in a memory
+  // operand.
+  assert(IsStackReg == FromStackReg &&
+         "One or more registers point to both stack and data locations at "
+         "one or more instructions, which is not supported by the "
+         "X86FixupSeparateStack pass.");
+
+  if (RootFrom == X86::NoRegister)
+    // This register cannot point to stack data, so it is unnecessary to
+    // perform consistency checks on its values.
+    return;
+
+  for (auto N : UsedInMemOp)
+    if (TRI->regsOverlap(RootFrom, N))
+      return;
+
+  UsedInMemOp.insert(RootFrom);
+
+  for (AddrRegReqs *P : Predecessors)
+    P->useInMemOp(RootFrom, IsStackReg);
+}
+
+/// \brief Return true iff Reg points to a stack location.
+bool AddrRegReqs::pointsIntoStack(unsigned Reg) const {
+  unsigned From = lookupRoot(Reg);
+  if (From == X86::NoRegister)
+    return false;
+
+  return OrigRegs.pointsIntoStack(From);
+}
+
+StackPtrSpillReqs::StackPtrSpillReqs(StackPtrSpillReqs *Predecessor)
+    : Top(false) {
+  Predecessors.insert(Predecessor);
+}
+
+/// Tracks whether a value assembled from multiple spill-slot bytes is a stack
+/// pointer; all of the bytes must agree.
+class LockingSPToggle {
+private:
+  bool Set;
+  bool Value;
+
+public:
+  LockingSPToggle() : Set(false), Value(false) {}
+
+  void set(bool V) {
+    if (Set) {
+      assert(V == Value &&
+             "Register fill would get a mixture of stack pointer and other "
+             "data");
+      return;
+    }
+
+    Value = V;
+    Set = true;
+  }
+
+  bool get() {
+    assert(Set);
+
+    return Value;
+  }
+};
+StackPtrSpillReqs::DemandResponse
+StackPtrSpillReqs::demand(int64_t Start, uint64_t Size,
+                          std::set<StackPtrSpillReqs *> Demanders) {
+  if (!Demanders.insert(this).second)
+    // A cycle was detected, and the demand is not satisfied within this
+    // cycle.
+    return DemandResponse::Unsatisfied;
+
+  LockingSPToggle SP;
+
+  for (uint64_t Offset = 0; Offset < Size; Offset++) {
+    auto SupplyI = Supply.find(Start + Offset);
+    bool Supplied = SupplyI != Supply.end();
+
+    if (Supplied) {
+      // This demand is satisfied by supply earlier in this function.
+      SP.set(SupplyI->second);
+      continue;
+    }
+
+    auto DemandI = Demand.find(Start + Offset);
+    bool Demanded = DemandI != Demand.end();
+
+    if (Demanded) {
+      // This demand was previously submitted and the result is cached.
+      SP.set(DemandI->second);
+      continue;
+    }
+
+    if (Top) {
+      // This assumes that arguments and uninitialized frame slots are not
+      // stack pointers.
+      SP.set(false);
+      continue;
+    }
+
+    // Propagate this demand to the previously-visited predecessors of this
+    // function.
+    bool PredSP = false;
+    bool DemandSatisfied = false;
+    for (auto Pred : Predecessors) {
+      auto Resp = Pred->demand(Start + Offset, 1, Demanders);
+      if (Resp == DemandResponse::Unsatisfied)
+        // The demand was not satisfied by this cycle, but it may be satisfied
+        // by a different predecessor.
+        continue;
+
+      bool SPLocal = Resp == DemandResponse::SatisfiedSP;
+      if (!DemandSatisfied)
+        PredSP = SPLocal;
+      else
+        assert(PredSP == SPLocal &&
+               "Inconsistent register spills detected");
+      DemandSatisfied = true;
+    }
+    if (!DemandSatisfied)
+      return DemandResponse::Unsatisfied;
+
+    // Cache the result of propagating this demand to the function's
+    // predecessors.
+    Demand.emplace(Start + Offset, PredSP);
+    SP.set(PredSP);
+  }
+
+  return SP.get() ? DemandResponse::SatisfiedSP
+                  : DemandResponse::SatisfiedNonSP;
+}
+
+bool StackPtrSpillReqs::isSPSpill(int64_t Start, uint64_t Size) {
+  std::set<StackPtrSpillReqs *> Demanders;
+  DemandResponse Resp = demand(Start, Size, Demanders);
+
+  assert(Resp != DemandResponse::Unsatisfied);
+
+  return Resp == DemandResponse::SatisfiedSP;
+}
+
+void StackPtrSpillReqs::addPredecessor(StackPtrSpillReqs *Predecessor) {
+  assert(Predecessors.find(Predecessor) == Predecessors.end());
+
+  std::set<StackPtrSpillReqs *> Demanders;
+
+  bool Satisfied = false;
+  for (auto D : Demand) {
+    auto Resp = Predecessor->demand(D.first, 1, Demanders);
+
+    // The Unsatisfied response can only be returned from a cycle that does
+    // not include the function entrypoint. There will be at least one other
+    // predecessor that will be checked and that will lead back to the
+    // function entrypoint, because the basic block traversal starts at the
+    // function entrypoint.
+    if (Resp == DemandResponse::Unsatisfied)
+      continue;
+
+    assert(Resp == (D.second ? DemandResponse::SatisfiedSP
+                             : DemandResponse::SatisfiedNonSP));
+
+    Satisfied = true;
+  }
+
+  assert((Satisfied || Demand.empty()) &&
+         "New predecessor does not satisfy any existing demand");
+
+  Predecessors.insert(Predecessor);
+}
+
+void StackPtrSpillReqs::supply(bool SP, int64_t Start, uint64_t Size) {
+  for (uint64_t Offset = 0; Offset < Size; Offset++)
+    Supply.emplace(Start + Offset, SP);
+}
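+// When a string instruction reads or writes through a stack pointer, the
+// emitted sequence temporarily points the data segment register at SS. For
+// example (hypothetical; ESI pointing into the stack):
+//   pushl %ds
+//   pushl %ss
+//   popl  %ds
+//   rep movsl
+//   popl  %ds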
+bool X86FixupSeparateStack::processStringInstr(MachineBasicBlock *BB,
+                                               MachineBasicBlock::iterator &I,
+                                               AddrRegReqs &AReqs,
+                                               bool StringDS, bool StringES) {
+  MachineFunction &MF = *BB->getParent();
+
+  // FIXME: Check for a segment override prefix on the string instruction and
+  // skip the following if one is detected.
+
+  bool StackDS = false;
+  bool StackES = false;
+
+  if (StringDS) {
+    StackDS = AReqs.pointsIntoStack(X86::ESI);
+    AReqs.useInMemOp(X86::ESI, StackDS);
+  }
+
+  if (StringES) {
+    StackES = AReqs.pointsIntoStack(X86::EDI);
+    AReqs.useInMemOp(X86::EDI, StackES);
+  }
+
+  if (!(StackDS || StackES))
+    return false;
+
+  // Emit instructions to save a segment register, overwrite it with the
+  // contents of SS, and restore its original value after the string
+  // instruction is complete. Returns an iterator pointing to the restoring
+  // POP instruction.
+  auto tempOverwriteSegWithSS = [&](unsigned PushOp, unsigned PopOp) {
+    // Save the original value of the segment register to be overwritten.
+    MachineInstr *NewMI =
+        MF.CreateMachineInstr(TII->get(PushOp), I->getDebugLoc());
+    I = BB->insert(I, NewMI);
+    // Push the stack segment selector onto the stack.
+    NewMI = MF.CreateMachineInstr(TII->get(X86::PUSHSS32), I->getDebugLoc());
+    I = BB->insertAfter(I, NewMI);
+    // Overwrite the segment register with the stack segment selector.
+    NewMI = MF.CreateMachineInstr(TII->get(PopOp), I->getDebugLoc());
+    I = BB->insertAfter(I, NewMI);
+    // Advance to the string instruction.
+    I++;
+    // Restore the original value of the segment register.
+    NewMI = MF.CreateMachineInstr(TII->get(PopOp), I->getDebugLoc());
+    return BB->insertAfter(I, NewMI);
+  };
+
+  if (StackDS) {
+    // Leave 'I' pointing at the POP instruction that restores DS.
+    I = tempOverwriteSegWithSS(X86::PUSHDS32, X86::POPDS32);
+  }
+  if (StackES) {
+    if (StackDS)
+      // Back up to the string instruction.
+      I--;
+    I = tempOverwriteSegWithSS(X86::PUSHES32, X86::POPES32);
+    if (StackDS)
+      // Advance past the POP instruction that restores DS.
+      I++;
+  }
+
+  return true;
+}
+
+bool X86FixupSeparateStack::processMemAccessInstr(MachineInstr &I,
+                                                  int MemoryOperand,
+                                                  AddrRegReqs &AReqs) {
+  MachineOperand &SegRegOp =
+      I.getOperand(MemoryOperand + X86::AddrSegmentReg);
+
+  unsigned PrevSegReg = SegRegOp.getReg();
+
+  if (TRI->isPhysicalRegister(PrevSegReg))
+    // Do not replace the existing segment override prefix.
+    return false;
+
+  unsigned BaseReg = I.getOperand(MemoryOperand + X86::AddrBaseReg).getReg();
+
+  bool BaseIsPhysReg = TRI->isPhysicalRegister(BaseReg);
+
+  bool InStack = AReqs.pointsIntoStack(BaseReg);
+
+  if (BaseIsPhysReg)
+    AReqs.useInMemOp(BaseReg, InStack);
+
+  if (InStack &&
+      (TRI->regsOverlap(X86::ESP, BaseReg) ||
+       TRI->regsOverlap(X86::EBP, BaseReg)))
+    // A memory operand with a base register of ESP or EBP implicitly
+    // references SS.
+    return false;
+
+  if (!InStack &&
+      // A memory operand without a base register implicitly references DS.
+      !(BaseIsPhysReg &&
+        // A memory operand with a base register other than ESP and EBP
+        // implicitly references DS. This assumes that ESP is never used as a
+        // non-stack pointer.
+        TRI->regsOverlap(X86::EBP, BaseReg)))
+    return false;
+
+  SegRegOp.ChangeToRegister(InStack ? X86::SS : X86::DS, false);
+
+  return true;
+}
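+// For example (hypothetical sequence), each recognized form can propagate a
+// stack pointer into another register:
+//   leal 16(%esp), %eax   ; LEA32r   -- EAX derives from ESP
+//   movl %eax, %ecx       ; MOV32rr  -- the derivation propagates to ECX
+//   addl $4, %ecx         ; ADD32ri8 -- ECX is still a derived stack pointer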
+unsigned
+X86FixupSeparateStack::mayComputeStackPtr(const MachineInstr &I) const {
+  switch (I.getOpcode()) {
+  // This assumes that only the following instructions are used by the
+  // compiler to compute a stack pointer derived directly or indirectly from
+  // ESP:
+  case X86::LEA32r:
+  case X86::MOV32rr:
+  case X86::ADD32ri8:
+  case X86::ADD32ri:
+    break;
+  default:
+    return X86::NoRegister;
+  }
+
+  for (const MachineOperand &Op : I.operands()) {
+    if (!Op.isReg())
+      continue;
+
+    if (!Op.isUse())
+      continue;
+
+    // Return the register that is used in the address computation.
+    return Op.getReg();
+  }
+
+  assert(false &&
+         "Unexpected failure to identify register used to compute address");
+  return X86::NoRegister;
+}
+
+X86FixupSeparateStack::FrameOp
+X86FixupSeparateStack::checkForRegisterSpill(MachineInstr &I,
+                                             int MemoryOperand,
+                                             bool PossibleSP,
+                                             AddrRegReqs &AReqs,
+                                             StackPtrSpillReqs &SReqs) {
+  unsigned BaseReg = I.getOperand(MemoryOperand + X86::AddrBaseReg).getReg();
+
+  if (!TRI->isPhysicalRegister(BaseReg))
+    return NoFrameOp;
+
+  if (!TRI->regsOverlap(FrameReg, BaseReg))
+    return NoFrameOp;
+
+  const MachineOperand &Disp = I.getOperand(MemoryOperand + X86::AddrDisp);
+
+  // FIXME: Consider whether other forms of addressing should be handled.
+  if (!Disp.isImm())
+    return NoFrameOp;
+
+  bool SP = false;
+  if (PossibleSP) {
+    int RegOpNo = I.mayStore() ? MemoryOperand + X86::AddrNumOperands : 0;
+    MachineOperand *RegOp = &(I.getOperand(RegOpNo));
+    SP = AReqs.pointsIntoStack(RegOp->getReg());
+  }
+
+  bool FoundMemOp = false;
+  uint64_t Size = 0;
+  for (auto MO : I.memoperands()) {
+    assert(!FoundMemOp && "Only a single memory operand is supported");
+    FoundMemOp = true;
+    Size = MO->getSize();
+  }
+
+  if (!FoundMemOp)
+    return NoFrameOp;
+
+  if (I.mayStore()) {
+    SReqs.supply(SP, Disp.getImm(), Size);
+    return SpillReg;
+  }
+
+  if (SReqs.isSPSpill(Disp.getImm(), Size))
+    return FillRegWithStackPtr;
+
+  return NoFrameOp;
+}
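+// Variadic-argument example (hypothetical): when the function contains a
+// va_start, an LEA such as
+//   leal 20(%esp), %eax
+// whose displacement reaches the first variadic argument (as computed from
+// StackGrowth and VAStartFI below) marks EAX as the last-computed va_list
+// pointer (VAPtr), so a later store of EAX is not flagged as an untracked
+// stack-pointer store.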
+bool X86FixupSeparateStack::processBasicBlock(
+    MachineBasicBlock *BB, int64_t StackGrowth,
+    std::shared_ptr<AddrRegReqs> AReqs,
+    std::shared_ptr<StackPtrSpillReqs> SReqs) {
+  bool Changed = false;
+
+  Reqs.emplace(BB, std::make_pair(AReqs, SReqs));
+
+  // Last-computed register that points to a variadic argument.
+  unsigned VAPtr = X86::NoRegister;
+
+  for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    bool StringDS, StringES;
+
+    unsigned Opcode = I->getOpcode();
+
+    switch (Opcode) {
+    case X86::REP_MOVSB_32:
+    case X86::REP_MOVSW_32:
+    case X86::REP_MOVSD_32:
+      StringDS = true;
+      StringES = true;
+      break;
+    case X86::REP_STOSB_32:
+    case X86::REP_STOSW_32:
+    case X86::REP_STOSD_32:
+      StringDS = false;
+      StringES = true;
+      break;
+    default:
+      StringDS = false;
+      StringES = false;
+      break;
+    }
+
+    if (StringDS || StringES) {
+      Changed |= processStringInstr(BB, I, *AReqs, StringDS, StringES);
+      continue;
+    }
+
+    const MCInstrDesc &Desc = TII->get(Opcode);
+    uint64_t TSFlags = Desc.TSFlags;
+
+    int MemoryOperand = -1;
+    if (I->mayLoadOrStore()) {
+      // Determine where the memory operand starts, if present.
+      MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+      if (MemoryOperand != -1)
+        MemoryOperand += X86II::getOperandBias(Desc);
+    }
+
+    bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+    bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+
+    unsigned MayDeriveStackPtrFrom = X86::NoRegister;
+
+    // This assumes that the compiler only uses the following forms of
+    // instructions to store stack pointers (including derived ones) to
+    // memory. Certain VEX-encoded instructions are specifically excluded to
+    // reduce the complexity of the subsequent code that computes the operand
+    // number of the register being stored. See
+    // X86MCCodeEmitter::EmitVEXOpcodePrefix for information about how these
+    // instructions are represented in LLVM. Again, this assumes that such
+    // instructions are not used to store stack pointers.
+    MachineOperand *StackPtrStoreSrcOp = nullptr;
+    if (Opcode == X86::PUSH32r) {
+      StackPtrStoreSrcOp = &(I->getOperand(0));
+    } else if (I->mayLoadOrStore() && !I->isCall() && MemoryOperand != -1) {
+      bool PossibleSP =
+          (((TSFlags & X86II::FormMask) == X86II::MRMDestMem) ||
+           ((TSFlags & X86II::FormMask) == X86II::MRMSrcMem)) &&
+          !(HasEVEX_K || HasVEX_4V);
+
+      FrameOp FrmOp = NoFrameOp;
+      if (FrameReg != StackReg)
+        FrmOp = checkForRegisterSpill(*I, MemoryOperand, PossibleSP, *AReqs,
+                                      *SReqs);
+
+      switch (FrmOp) {
+      case FillRegWithStackPtr:
+        assert(PossibleSP);
+        MayDeriveStackPtrFrom = StackReg;
+        break;
+      case SpillReg:
+        break;
+      default:
+        if (PossibleSP && I->mayStore())
+          StackPtrStoreSrcOp =
+              &(I->getOperand(MemoryOperand + X86::AddrNumOperands));
+        break;
+      }
+    }
+
+    assert((StackPtrStoreSrcOp == nullptr ||
+            !AReqs->pointsIntoStack(StackPtrStoreSrcOp->getReg()) ||
+            I->readsRegister(VAPtr) ||
+            (SPStoreVerification == SPStoreVerif::None) ||
+            ((SPStoreVerification == SPStoreVerif::IgnoreVA) &&
+             HasVAStart)) &&
+           "The X86FixupSeparateStack pass is unable to track stack "
+           "pointers that are stored to memory.\n");
+
+    if (MemoryOperand != -1)
+      Changed |= processMemAccessInstr(*I, MemoryOperand, *AReqs);
+    else
+      MayDeriveStackPtrFrom = mayComputeStackPtr(*I);
+
+    for (MachineOperand &Op : I->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+
+      unsigned Reg = Op.getReg();
+
+      if (!TRI->isPhysicalRegister(Reg))
+        continue;
+
+      if (MayDeriveStackPtrFrom != X86::NoRegister) {
+        AReqs->derive(MayDeriveStackPtrFrom, Reg);
+        continue;
+      }
+
+      AReqs->markNotDerived(Reg);
+    }
+
+    if (SPStoreVerification == SPStoreVerif::None || !HasVAStart)
+      continue;
+
+    if (I->definesRegister(VAPtr))
+      VAPtr = X86::NoRegister;
+
+    // FIXME: Support more complex instructions for computing addresses of
+    // variadic arguments.
+    if (Opcode == X86::LEA32r &&
+        I->getOperand(2).getImm() == 1 &&
+        I->getOperand(3).getReg() == X86::NoRegister &&
+        I->getOperand(5).getReg() == X86::NoRegister &&
+        ((I->getOperand(1).getReg() == StackReg &&
+          I->getOperand(4).getImm() >= StackGrowth + VAStartFI * -4) ||
+         (StackReg != FrameReg &&
+          I->getOperand(1).getReg() == FrameReg &&
+          I->getOperand(4).getImm() >= VAStartFI * -4)))
+      // This assumes that this relative stack slot will always correspond to
+      // the first variadic argument in all invocations of this basic block,
+      // since it is only checked during the first invocation that is
+      // encountered by this pass.
+      VAPtr = I->getOperand(0).getReg();
+
+    if (!I->definesRegister(StackReg))
+      continue;
+
+    // This assumes that only the following instructions are used to update
+    // ESP.
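+    // For example (hypothetical values): "subl $12, %esp" increases
+    // StackGrowth by 12, and "popl %eax" decreases it by 4.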
+    switch (Opcode) {
+    case X86::ADD32ri:
+    case X86::ADD32ri8:
+      assert(I->getOperand(0).getReg() == StackReg &&
+             I->getOperand(1).getReg() == StackReg &&
+             "Unsupported instruction");
+      StackGrowth -= I->getOperand(2).getImm();
+      break;
+    case X86::SUB32ri:
+    case X86::SUB32ri8:
+      assert(I->getOperand(0).getReg() == StackReg &&
+             I->getOperand(1).getReg() == StackReg &&
+             "Unsupported instruction");
+      StackGrowth += I->getOperand(2).getImm();
+      break;
+    case X86::LEA32r:
+      assert(I->getOperand(2).getImm() == 1 &&
+             I->getOperand(3).getReg() == X86::NoRegister &&
+             I->getOperand(5).getReg() == X86::NoRegister &&
+             "Unsupported instruction");
+      StackGrowth -= I->getOperand(4).getImm();
+      break;
+    case X86::PUSHi32:
+    case X86::PUSH32i8:
+    case X86::PUSH32r:
+    case X86::PUSH32rmm:
+      StackGrowth += 4;
+      break;
+    case X86::POP32r:
+    case X86::POP32rmm:
+      StackGrowth -= 4;
+      break;
+    }
+
+    assert(0 <= StackGrowth && "Error occurred while computing stack size");
+  }
+
+  for (auto Succ : BB->successors()) {
+    auto RI = Reqs.find(Succ);
+    if (RI == Reqs.end()) {
+      std::shared_ptr<AddrRegReqs> SuccAReqs(new AddrRegReqs(AReqs.get()));
+      std::shared_ptr<StackPtrSpillReqs> SuccSReqs(
+          new StackPtrSpillReqs(SReqs.get()));
+      Changed |= processBasicBlock(Succ, StackGrowth, SuccAReqs, SuccSReqs);
+    } else {
+      RI->second.first->addPredecessor(AReqs.get());
+      RI->second.second->addPredecessor(SReqs.get());
+    }
+  }
+
+  return Changed;
+}
+
+bool X86FixupSeparateStack::runOnMachineFunction(MachineFunction &MF) {
+  STI = &MF.getSubtarget<X86Subtarget>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  FrameReg = TRI->getFrameRegister(MF);
+  StackReg = TRI->getStackRegister();
+
+  if (!STI->useSeparateStackSeg())
+    return false;
+
+  assert(!STI->is64Bit() && "Only X86-32 is supported.");
+
+  HasVAStart = MF.getFrameInfo()->hasVAStart();
+  if (HasVAStart)
+    VAStartFI = MF.getInfo<X86MachineFunctionInfo>()->getVarArgsFrameIndex();
+
+  Reqs.clear();
+
+  std::shared_ptr<AddrRegReqs> InitAReqs(new AddrRegReqs(TRI));
+  std::shared_ptr<StackPtrSpillReqs> InitSReqs(new StackPtrSpillReqs());
+  return processBasicBlock(&MF.front(), 0, InitAReqs, InitSReqs);
+}
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -269,6 +269,9 @@
   /// Use software floating point for code generation.
   bool UseSoftFloat;
 
+  /// Use a separate stack segment
+  bool UseSeparateStackSeg;
+
   /// The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -448,6 +451,7 @@
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
   bool isSLM() const { return X86ProcFamily == IntelSLM; }
   bool useSoftFloat() const { return UseSoftFloat; }
+  bool useSeparateStackSeg() const { return UseSeparateStackSeg; }
 
   /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
   /// no-sse2). There isn't any reason to disable it if the target processor
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -333,6 +333,7 @@
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
   UseSoftFloat = false;
+  UseSeparateStackSeg = false;
 }
 
 X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -284,4 +284,6 @@
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
   }
+
+  addPass(createX86FixupSeparateStack());
 }