Index: lib/CodeGen/StackColoring.cpp
===================================================================
--- lib/CodeGen/StackColoring.cpp
+++ lib/CodeGen/StackColoring.cpp
@@ -252,49 +252,54 @@
   /// Each bit in the BitVector represents the liveness property
   /// for a different stack slot.
   struct BlockLifetimeInfo {
-    /// Which slots BEGINs in each basic block.
+    /// Which slots BEGIN in this block and survive to its end.
     BitVector Begin;
-    /// Which slots ENDs in each basic block.
+    /// Which slots BEGIN and END in this block.
+    BitVector Use;
+    /// Which slots END in this block.
     BitVector End;
-    /// Which slots are marked as LIVE_IN, coming into each basic block.
+    /// Which slots are marked as LIVE_IN, coming into this block.
     BitVector LiveIn;
-    /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+    /// Which slots are marked as LIVE_OUT, coming out of this block.
     BitVector LiveOut;
   };

   /// Maps active slots (per bit) for each basic block.
   typedef DenseMap<const MachineBasicBlock*, BlockLifetimeInfo> LivenessMap;
   LivenessMap BlockLiveness;

   /// Maps serial numbers to basic blocks.
   DenseMap<const MachineBasicBlock*, int> BasicBlocks;
   /// Maps basic blocks to a serial number.
   SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering;

-  /// Maps liveness intervals for each slot.
+  /// Maps slots to their activity interval. Outside of this interval, slots
+  /// values are either dead or `undef` and they will not be written to.
   SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
+  /// Maps slots to the set of gen-points of their intervals.
+  SmallVector<std::unique_ptr<LiveInterval>, 16> IntervalStarts;
   /// VNInfo is used for the construction of LiveIntervals.
   VNInfo::Allocator VNInfoAllocator;
   /// SlotIndex analysis object.
   SlotIndexes *Indexes;
   /// The stack protector object.
   StackProtector *SP;

   /// The list of lifetime markers found. These markers are to be removed
   /// once the coloring is done.
   SmallVector<MachineInstr*, 8> Markers;

   /// Record the FI slots for which we have seen some sort of
   /// lifetime marker (either start or end).
   BitVector InterestingSlots;

   /// FI slots that need to be handled conservatively (for these
   /// slots lifetime-start-on-first-use is disabled).
   BitVector ConservativeSlots;

   /// Number of iterations taken during data flow analysis.
   unsigned NumIterations;

 public:
   static char ID;
   StackColoring() : MachineFunctionPass(ID) {
@@ -399,11 +404,12 @@
   const BlockLifetimeInfo &BlockInfo = BI->second;

   dumpBV("BEGIN", BlockInfo.Begin);
+  dumpBV("USE", BlockInfo.Use);
   dumpBV("END", BlockInfo.End);
   dumpBV("LIVE_IN", BlockInfo.LiveIn);
   dumpBV("LIVE_OUT", BlockInfo.LiveOut);
 }

 LLVM_DUMP_METHOD void StackColoring::dump() const {
   for (MachineBasicBlock *MBB : depth_first(MF)) {
     dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
@@ -416,10 +422,12 @@
   for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
     dbgs() << "Interval[" << I << "]:\n";
     Intervals[I]->dump();
+    dbgs() << "IntervalStarts[" << I << "]:\n";
+    IntervalStarts[I]->dump();
   }
 }
 #endif

 static inline int getStartOrEndSlot(const MachineInstr &MI)
 {
   assert((MI.getOpcode() == TargetOpcode::LIFETIME_START ||
@@ -579,8 +587,9 @@
     BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB];

     BlockInfo.Begin.resize(NumSlot);
+    BlockInfo.Use.resize(NumSlot);
     BlockInfo.End.resize(NumSlot);

     SmallVector<int, 4> slots;
     for (MachineInstr &MI : *MBB) {
       bool isStart = false;
@@ -591,6 +600,7 @@
           int Slot = slots[0];
           if (BlockInfo.Begin.test(Slot)) {
             BlockInfo.Begin.reset(Slot);
+            BlockInfo.Use.set(Slot);
           }
           BlockInfo.End.set(Slot);
         } else {
@@ -606,18 +616,21 @@
             if (BlockInfo.End.test(Slot)) {
               BlockInfo.End.reset(Slot);
             }
+            if (BlockInfo.Use.test(Slot)) {
+              BlockInfo.Use.reset(Slot);
+            }
             BlockInfo.Begin.set(Slot);
           }
         }
       }
     }
   }

   // Update statistics.
   NumMarkerSeen += MarkersFound;
   return MarkersFound;
 }

 void StackColoring::calculateLocalLiveness()
 {
   unsigned NumIters = 0;
@@ -741,11 +754,16 @@

       assert(Starts[i] && Finishes[i] && "Invalid interval");
       VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
+      VNInfo *ValNumS = IntervalStarts[i]->getValNumInfo(0);
       SlotIndex S = Starts[i];
       SlotIndex F = Finishes[i];
       if (S < F) {
         // We have a single consecutive region.
         Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
+        // FIXME: stop cargo culting
+        if (MBBLiveness.Begin.test(i) || MBBLiveness.Use.test(i)) {
+          IntervalStarts[i]->addSegment(LiveInterval::Segment(S, F, ValNumS));
+        }
       } else {
         // We have two non-consecutive regions. This happens when
         // LIFETIME_START appears after the LIFETIME_END marker.
@@ -753,11 +771,16 @@
         SlotIndex NewFin = Indexes->getMBBEndIdx(&MBB);
         Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
         Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
+        // FIXME: stop cargo culting
+        if (MBBLiveness.Begin.test(i) || MBBLiveness.Use.test(i)) {
+          IntervalStarts[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNumS));
+          IntervalStarts[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNumS));
+        }
       }
     }
   }
 }

 bool StackColoring::removeAllMarkers() {
   unsigned Count = 0;
   for (MachineInstr *MI : Markers) {
@@ -983,31 +1006,32 @@
   BasicBlockNumbering.clear();
   Markers.clear();
   Intervals.clear();
+  IntervalStarts.clear();
   VNInfoAllocator.Reset();

   unsigned NumSlots = MFI->getObjectIndexEnd();

   // If there are no stack slots then there are no markers to remove.
   if (!NumSlots)
     return false;

   SmallVector<int, 8> SortedSlots;
   SortedSlots.reserve(NumSlots);
   Intervals.reserve(NumSlots);

   unsigned NumMarkers = collectMarkers(NumSlots);

   unsigned TotalSize = 0;
   DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
   DEBUG(dbgs()<<"Slot structure:\n");

   for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
     DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
     TotalSize += MFI->getObjectSize(i);
   }

   DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");

   // Don't continue because there are not enough lifetime markers, or the
   // stack is too small, or we are told not to optimize the slots.
   if (NumMarkers < 2 || TotalSize < 16 || DisableColoring ||
@@ -1020,40 +1044,46 @@
     std::unique_ptr<LiveInterval> LI(new LiveInterval(i, 0));
     LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
     Intervals.push_back(std::move(LI));
+
+    // Just cargo culting. Please help me DTRT.
+    std::unique_ptr<LiveInterval> SI(new LiveInterval(i, 0));
+    SI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
+    IntervalStarts.push_back(std::move(SI));
+
     SortedSlots.push_back(i);
   }

   // Calculate the liveness of each block.
   calculateLocalLiveness();
   DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
   DEBUG(dump());

   // Propagate the liveness information.
   calculateLiveIntervals(NumSlots);
   DEBUG(dumpIntervals());

   // Search for allocas which are used outside of the declared lifetime
   // markers.
   if (ProtectFromEscapedAllocas)
     removeInvalidSlotRanges();

   // Maps old slots to new slots.
   DenseMap<int, int> SlotRemap;
   unsigned RemovedSlots = 0;
   unsigned ReducedSize = 0;

   // Do not bother looking at empty intervals.
   for (unsigned I = 0; I < NumSlots; ++I) {
     if (Intervals[SortedSlots[I]]->empty())
       SortedSlots[I] = -1;
   }

   // This is a simple greedy algorithm for merging allocas. First, sort the
   // slots, placing the largest slots first. Next, perform an n^2 scan and look
   // for disjoint slots. When you find disjoint slots, merge the samller one
   // into the bigger one and update the live interval. Remove the small alloca
   // and continue.

   // Sort the slots according to their size. Place unused slots at the end.
   // Use stable sort to guarantee deterministic code generation.
   std::stable_sort(SortedSlots.begin(), SortedSlots.end(),
@@ -1079,24 +1109,82 @@
         int FirstSlot = SortedSlots[I];
         int SecondSlot = SortedSlots[J];
         LiveInterval *First = &*Intervals[FirstSlot];
+        LiveInterval *FirstS = &*IntervalStarts[FirstSlot];
         LiveInterval *Second = &*Intervals[SecondSlot];
+        LiveInterval *SecondS = &*IntervalStarts[SecondSlot];
         assert (!First->empty() && !Second->empty() && "Found an empty range");

-        // Merge disjoint slots.
-        if (!First->overlaps(*Second)) {
+        // Merge disjoint slots. Now, the condition for this is a little bit
+	// tricky.
+        //
+        // The fundamental condition we want to preserve is that each stack
+        // slot has the correct contents at each point it is live.
+        //
+        // We *could* compute liveness using the standard backward dataflow
+        // algorithm. Unfortunately, that does not give very good results in the
+        // presence of aliasing, so we have frontends emit `lifetime.start` and
+        // `lifetime.end` intrinsics that make undesirable accesses UB.
+        //
+        // The effect of these intrinsics is as follows:
+        //   1) at start, each stack-slot is marked as *out-of-scope*, unless no
+        //   lifetime intrinsic refers to that stack slot, in which case
+        //   it is marked as *in-scope*.
+        //   2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
+        //   the stack slot is overwritten with `undef`.
+        //   3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
+        //   4) on function exit, all stack slots are marked as *out-of-scope*.
+        //   5) the effects of calling `lifetime.start` on an *in-scope* stack-slot,
+        //   or `lifetime.end` on an *out-of-scope* stack-slot, are left unspecified.
+        //   6) memory accesses to *out-of-scope* stack slots are UB.
+        //   7) when a stack-slot is marked as *out-of-scope*, all pointers to it
+        //   are invalidated unless it looks like they might be used (?). This
+        //   is used to justify not marking slots as live until the pointer
+        //   to them is used, but I think this should be clarified better.
+	//
+	// If we define a slot as *active* at a program point if it either can
+	// be written to, or if it has a live and non-undef content, then it
+	// is obvious that slots that are never active together can be merged.
+	//
+        // From our rules, we see that *out-of-scope* slots are never *active*,
+        // and from (7) we see that "non-conservative" slots remain non-*active*
+        // until their address is taken. Therefore, we can approximate slot activity
+        // using dataflow.
+        //
+        // Now, naively, we might think that we could construct our interference
+	// graph by propagating `S active` through the CFG for every stack-slot `S`,
+	// and having `S` and `T` interfere if there is a point in which they are
+	// both *active*. That is sound, but overly conservative in some important
+	// cases: it is possible that `S` is active on one predecessor edge and
+	// `T` is active on another. See PR32488.
+	//
+	// If we want to construct the interference graph precisely, we could
+        // propagate `S active` and `S&T active` predicates through the CFG. That
+        // would be precise, but requires propagating `O(n^2)` dataflow facts.
+        //
+        // Instead, we rely on a little trick: for an `S&T active` predicate to
+        // start holding, there has to be either
+        // A) a point in the gen-set of `S active` where `T` is *active*
+        // B) a point in the gen-set of `T active` where `S` is *active*
+        // C) a point in the gen-set of both `S active` and `T active`.
+        //
+        // Of course, the `S&T active` predicate can be propagated further, but
+        // it holding at 1 point is enough for us to mark an edge on the interference
+        // graph. So that's what we do.
+        if (!First->overlaps(*SecondS) && !FirstS->overlaps(*Second)) {
           Changed = true;
           First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
+          FirstS->MergeSegmentsInAsValue(*SecondS, FirstS->getValNumInfo(0));
           SlotRemap[SecondSlot] = FirstSlot;
           SortedSlots[J] = -1;
           DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
                 SecondSlot<<" together.\n");
           unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
                                            MFI->getObjectAlignment(SecondSlot));

           assert(MFI->getObjectSize(FirstSlot) >=
                  MFI->getObjectSize(SecondSlot) &&
                  "Merging a small object into a larger one");

           RemovedSlots+=1;
           ReducedSize += MFI->getObjectSize(SecondSlot);
           MFI->setObjectAlignment(FirstSlot, MaxAlignment);
Index: test/CodeGen/X86/StackColoring.ll
===================================================================
--- test/CodeGen/X86/StackColoring.ll
+++ test/CodeGen/X86/StackColoring.ll
@@ -582,14 +582,56 @@
   ret i32 %x.addr.0
 }

+
+;CHECK-LABEL: pr32488:
+;YESCOLOR: subq  $256, %rsp
+;NOFIRSTUSE: subq  $256, %rsp
+;NOCOLOR: subq  $512, %rsp
+define i1 @pr32488(i1, i1)
+{
+entry-block:
+  %foo = alloca [32 x i64]
+  %bar = alloca [32 x i64]
+  %foo_i8 = bitcast [32 x i64]* %foo to i8*
+  %bar_i8 = bitcast [32 x i64]* %bar to i8*
+  br i1 %0, label %if_false, label %if_true
+if_false:
+  call void @llvm.lifetime.start(i64 256, i8* %bar_i8)
+  call void @baz([32 x i64]* %foo, i32 0)
+  br i1 %1, label %if_false.1, label %onerr
+if_false.1:
+  call void @llvm.lifetime.end(i64 256, i8* %bar_i8)
+  br label %merge
+if_true:
+  call void @llvm.lifetime.start(i64 256, i8* %foo_i8)
+  call void @baz([32 x i64]* %foo, i32 1)
+  br i1 %1, label %if_true.1, label %onerr
+if_true.1:
+  call void @llvm.lifetime.end(i64 256, i8* %foo_i8)
+  br label %merge
+merge:
+  ret i1 false
+onerr:
+  call void @llvm.lifetime.end(i64 256, i8* %foo_i8)
+  call void @llvm.lifetime.end(i64 256, i8* %bar_i8)
+  call void @destructor()
+  ret i1 true
+}
+
+%Data = type { [32 x i64] }
+
+declare void @destructor()
+
 declare void @inita(i32*)

 declare void @initb(i32*,i32*,i32*)

 declare void @bar([100 x i32]* , [100 x i32]*) nounwind

+declare void @baz([32 x i64]*, i32)
+
 declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind

 declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind

 declare i32 @foo(i32, i8*)