Index: lib/CodeGen/StackColoring.cpp =================================================================== --- lib/CodeGen/StackColoring.cpp +++ lib/CodeGen/StackColoring.cpp @@ -252,49 +252,54 @@ /// Each bit in the BitVector represents the liveness property /// for a different stack slot. struct BlockLifetimeInfo { - /// Which slots BEGINs in each basic block. + /// Which slots BEGIN in this block and survive to its end. BitVector Begin; - /// Which slots ENDs in each basic block. + /// Which slots BEGIN and END in this block. + BitVector Use; + /// Which slots END in this block. BitVector End; - /// Which slots are marked as LIVE_IN, coming into each basic block. + /// Which slots are marked as LIVE_IN, coming into this block. BitVector LiveIn; - /// Which slots are marked as LIVE_OUT, coming out of each basic block. + /// Which slots are marked as LIVE_OUT, coming out of this block. BitVector LiveOut; }; /// Maps active slots (per bit) for each basic block. typedef DenseMap LivenessMap; LivenessMap BlockLiveness; /// Maps serial numbers to basic blocks. DenseMap BasicBlocks; /// Maps basic blocks to a serial number. SmallVector BasicBlockNumbering; - /// Maps liveness intervals for each slot. + /// Maps slots to their activity interval. Outside of this interval, slots + /// values are either dead or `undef` and they will not be written to. SmallVector, 16> Intervals; + /// Maps slots to the set of gen-points of their intervals. + SmallVector, 16> IntervalStarts; /// VNInfo is used for the construction of LiveIntervals. VNInfo::Allocator VNInfoAllocator; /// SlotIndex analysis object. SlotIndexes *Indexes; /// The stack protector object. StackProtector *SP; /// The list of lifetime markers found. These markers are to be removed /// once the coloring is done. SmallVector Markers; /// Record the FI slots for which we have seen some sort of /// lifetime marker (either start or end). BitVector InterestingSlots; /// FI slots that need to be handled conservatively (for these /// slots lifetime-start-on-first-use is disabled). BitVector ConservativeSlots; /// Number of iterations taken during data flow analysis. unsigned NumIterations; public: static char ID; StackColoring() : MachineFunctionPass(ID) { @@ -399,11 +404,12 @@ const BlockLifetimeInfo &BlockInfo = BI->second; dumpBV("BEGIN", BlockInfo.Begin); + dumpBV("USE", BlockInfo.Use); dumpBV("END", BlockInfo.End); dumpBV("LIVE_IN", BlockInfo.LiveIn); dumpBV("LIVE_OUT", BlockInfo.LiveOut); } LLVM_DUMP_METHOD void StackColoring::dump() const { for (MachineBasicBlock *MBB : depth_first(MF)) { dbgs() << "Inspecting block #" << MBB->getNumber() << " [" @@ -416,10 +422,12 @@ for (unsigned I = 0, E = Intervals.size(); I != E; ++I) { dbgs() << "Interval[" << I << "]:\n"; Intervals[I]->dump(); + dbgs() << "IntervalStarts[" << I << "]:\n"; + IntervalStarts[I]->dump(); } } #endif static inline int getStartOrEndSlot(const MachineInstr &MI) { assert((MI.getOpcode() == TargetOpcode::LIFETIME_START || @@ -579,8 +587,9 @@ BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB]; BlockInfo.Begin.resize(NumSlot); + BlockInfo.Use.resize(NumSlot); BlockInfo.End.resize(NumSlot); SmallVector slots; for (MachineInstr &MI : *MBB) { bool isStart = false; @@ -591,6 +600,7 @@ int Slot = slots[0]; if (BlockInfo.Begin.test(Slot)) { BlockInfo.Begin.reset(Slot); + BlockInfo.Use.set(Slot); } BlockInfo.End.set(Slot); } else { @@ -606,18 +616,21 @@ if (BlockInfo.End.test(Slot)) { BlockInfo.End.reset(Slot); } + if (BlockInfo.Use.test(Slot)) { + BlockInfo.Use.reset(Slot); + } BlockInfo.Begin.set(Slot); } } } } } // Update statistics. NumMarkerSeen += MarkersFound; return MarkersFound; } void StackColoring::calculateLocalLiveness() { unsigned NumIters = 0; @@ -741,11 +754,16 @@ assert(Starts[i] && Finishes[i] && "Invalid interval"); VNInfo *ValNum = Intervals[i]->getValNumInfo(0); + VNInfo *ValNumS = IntervalStarts[i]->getValNumInfo(0); SlotIndex S = Starts[i]; SlotIndex F = Finishes[i]; if (S < F) { // We have a single consecutive region. Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum)); + // FIXME: stop cargo culting + if (MBBLiveness.Begin.test(i) || MBBLiveness.Use.test(i)) { + IntervalStarts[i]->addSegment(LiveInterval::Segment(S, F, ValNumS)); + } } else { // We have two non-consecutive regions. This happens when // LIFETIME_START appears after the LIFETIME_END marker. @@ -753,11 +771,16 @@ SlotIndex NewFin = Indexes->getMBBEndIdx(&MBB); Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum)); Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum)); + // FIXME: stop cargo culting + if (MBBLiveness.Begin.test(i) || MBBLiveness.Use.test(i)) { + IntervalStarts[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNumS)); + IntervalStarts[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNumS)); + } } } } } bool StackColoring::removeAllMarkers() { unsigned Count = 0; for (MachineInstr *MI : Markers) { @@ -983,31 +1006,32 @@ BasicBlockNumbering.clear(); Markers.clear(); Intervals.clear(); + IntervalStarts.clear(); VNInfoAllocator.Reset(); unsigned NumSlots = MFI->getObjectIndexEnd(); // If there are no stack slots then there are no markers to remove. if (!NumSlots) return false; SmallVector SortedSlots; SortedSlots.reserve(NumSlots); Intervals.reserve(NumSlots); unsigned NumMarkers = collectMarkers(NumSlots); unsigned TotalSize = 0; DEBUG(dbgs()<<"Found "<getObjectIndexEnd(); ++i) { DEBUG(dbgs()<<"Slot #"<getObjectSize(i)<<" bytes.\n"); TotalSize += MFI->getObjectSize(i); } DEBUG(dbgs()<<"Total Stack size: "< LI(new LiveInterval(i, 0)); LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator); Intervals.push_back(std::move(LI)); + + // Just cargo culting. Please help me DTRT. + std::unique_ptr SI(new LiveInterval(i, 0)); + SI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator); + IntervalStarts.push_back(std::move(SI)); + SortedSlots.push_back(i); } // Calculate the liveness of each block. calculateLocalLiveness(); DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n"); DEBUG(dump()); // Propagate the liveness information. calculateLiveIntervals(NumSlots); DEBUG(dumpIntervals()); // Search for allocas which are used outside of the declared lifetime // markers. if (ProtectFromEscapedAllocas) removeInvalidSlotRanges(); // Maps old slots to new slots. DenseMap SlotRemap; unsigned RemovedSlots = 0; unsigned ReducedSize = 0; // Do not bother looking at empty intervals. for (unsigned I = 0; I < NumSlots; ++I) { if (Intervals[SortedSlots[I]]->empty()) SortedSlots[I] = -1; } // This is a simple greedy algorithm for merging allocas. First, sort the // slots, placing the largest slots first. Next, perform an n^2 scan and look // for disjoint slots. When you find disjoint slots, merge the samller one // into the bigger one and update the live interval. Remove the small alloca // and continue. // Sort the slots according to their size. Place unused slots at the end. // Use stable sort to guarantee deterministic code generation. std::stable_sort(SortedSlots.begin(), SortedSlots.end(), @@ -1079,24 +1109,82 @@ int FirstSlot = SortedSlots[I]; int SecondSlot = SortedSlots[J]; LiveInterval *First = &*Intervals[FirstSlot]; + LiveInterval *FirstS = &*IntervalStarts[FirstSlot]; LiveInterval *Second = &*Intervals[SecondSlot]; + LiveInterval *SecondS = &*IntervalStarts[SecondSlot]; assert (!First->empty() && !Second->empty() && "Found an empty range"); - // Merge disjoint slots. - if (!First->overlaps(*Second)) { + // Merge disjoint slots. Now, the condition for this is a little bit + // tricky. + // + // The fundamental condition we want to preserve is that each stack + // slot has the correct contents at each point it is live. + // + // We *could* compute liveness using the standard backward dataflow + // algorithm. Unfortunately, that does not give very good results in the + // presence of aliasing, so we have frontends emit `lifetime.start` and + // `lifetime.end` intrinsics that make undesirable accesses UB. + // + // The effect of these intrinsics is as follows: + // 1) at start, each stack-slot is marked as *out-of-scope*, unless no + // lifetime intrinsic refers to that stack slot, in which case + // it is marked as *in-scope*. + // 2) on a `lifetime.start`, a stack slot is marked as *in-scope* and + // the stack slot is overwritten with `undef`. + // 3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*. + // 4) on function exit, all stack slots are marked as *out-of-scope*. + // 5) the effects of calling `lifetime.start` on an *in-scope* stack-slot, + // or `lifetime.end` on an *out-of-scope* stack-slot, are left unspecified. + // 6) memory accesses to *out-of-scope* stack slots are UB. + // 7) when a stack-slot is marked as *out-of-scope*, all pointers to it + // are invalidated unless it looks like they might be used (?). This + // is used to justify not marking slots as live until the pointer + // to them is used, but I think this should be clarified better. + // + // If we define a slot as *active* at a program point if it either can + // be written to, or if it has a live and non-undef content, then it + // is obvious that slots that are never active together can be merged. + // + // From our rules, we see that *out-of-scope* slots are never *active*, + // and from (7) we see that "non-conservative" slots remain non-*active* + // until their address is taken. Therefore, we can approximate slot activity + // using dataflow. + // + // Now, naively, we might think that we could construct our interference + // graph by propagating `S active` through the CFG for every stack-slot `S`, + // and having `S` and `T` interfere if there is a point in which they are + // both *active*. That is sound, but overly conservative in some important + // cases: it is possible that `S` is active on one predecessor edge and + // `T` is active on another. See PR32488. + // + // If we want to construct the interference graph precisely, we could + // propagate `S active` and `S&T active` predicates through the CFG. That + // would be precise, but requires propagating `O(n^2)` dataflow facts. + // + // Instead, we rely on a little trick: for an `S&T active` predicate to + // start holding, there has to be either + // A) a point in the gen-set of `S active` where `T` is *active* + // B) a point in the gen-set of `T active` where `S` is *active* + // C) a point in the gen-set of both `S active` and `T active`. + // + // Of course, the `S&T active` predicate can be propagated further, but + // it holding at 1 point is enough for us to mark an edge on the interference + // graph. So that's what we do. + if (!First->overlaps(*SecondS) && !FirstS->overlaps(*Second)) { Changed = true; First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0)); + FirstS->MergeSegmentsInAsValue(*SecondS, FirstS->getValNumInfo(0)); SlotRemap[SecondSlot] = FirstSlot; SortedSlots[J] = -1; DEBUG(dbgs()<<"Merging #"<getObjectAlignment(FirstSlot), MFI->getObjectAlignment(SecondSlot)); assert(MFI->getObjectSize(FirstSlot) >= MFI->getObjectSize(SecondSlot) && "Merging a small object into a larger one"); RemovedSlots+=1; ReducedSize += MFI->getObjectSize(SecondSlot); MFI->setObjectAlignment(FirstSlot, MaxAlignment); Index: test/CodeGen/X86/StackColoring.ll =================================================================== --- test/CodeGen/X86/StackColoring.ll +++ test/CodeGen/X86/StackColoring.ll @@ -582,14 +582,56 @@ ret i32 %x.addr.0 } + +;CHECK-LABEL: pr32488: +;YESCOLOR: subq $256, %rsp +;NOFIRSTUSE: subq $256, %rsp +;NOCOLOR: subq $512, %rsp +define i1 @pr32488(i1, i1) +{ +entry-block: + %foo = alloca [32 x i64] + %bar = alloca [32 x i64] + %foo_i8 = bitcast [32 x i64]* %foo to i8* + %bar_i8 = bitcast [32 x i64]* %bar to i8* + br i1 %0, label %if_false, label %if_true +if_false: + call void @llvm.lifetime.start(i64 256, i8* %bar_i8) + call void @baz([32 x i64]* %foo, i32 0) + br i1 %1, label %if_false.1, label %onerr +if_false.1: + call void @llvm.lifetime.end(i64 256, i8* %bar_i8) + br label %merge +if_true: + call void @llvm.lifetime.start(i64 256, i8* %foo_i8) + call void @baz([32 x i64]* %foo, i32 1) + br i1 %1, label %if_true.1, label %onerr +if_true.1: + call void @llvm.lifetime.end(i64 256, i8* %foo_i8) + br label %merge +merge: + ret i1 false +onerr: + call void @llvm.lifetime.end(i64 256, i8* %foo_i8) + call void @llvm.lifetime.end(i64 256, i8* %bar_i8) + call void @destructor() + ret i1 true +} + +%Data = type { [32 x i64] } + +declare void @destructor() + declare void @inita(i32*) declare void @initb(i32*,i32*,i32*) declare void @bar([100 x i32]* , [100 x i32]*) nounwind +declare void @baz([32 x i64]*, i32) + declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare i32 @foo(i32, i8*)