Index: include/llvm/Transforms/IPO/LowerBitSets.h
===================================================================
--- include/llvm/Transforms/IPO/LowerBitSets.h
+++ include/llvm/Transforms/IPO/LowerBitSets.h
@@ -20,6 +20,7 @@
 #include <stdint.h>
 #include <limits>
+#include <set>
 #include <vector>
 
 namespace llvm {
@@ -73,6 +74,69 @@
   BitSetInfo build();
 };
 
+/// This class implements a layout algorithm for globals referenced by bit sets
+/// that tries to keep members of small bit sets together. This can
+/// significantly reduce bit set sizes in many cases.
+///
+/// It works by assembling fragments of layout from sets of referenced globals.
+/// Each set of referenced globals causes the algorithm to create a new
+/// fragment, which is assembled by appending each referenced global in the set
+/// into the fragment. If a referenced global has already been referenced by a
+/// fragment created earlier, we instead delete that fragment and append its
+/// contents into the fragment we are assembling.
+///
+/// By starting with the smallest fragments, we minimize the size of the
+/// fragments that are copied into larger fragments. This is most intuitively
+/// thought about when considering the case where the globals are virtual
+/// tables and the bit sets represent their derived classes: in a single
+/// inheritance hierarchy, the optimum layout would involve a depth-first
+/// search of the class hierarchy (and in fact the computed layout ends up
+/// looking a lot like a DFS), but a naive DFS would not work well in the
+/// presence of multiple inheritance. This aspect of the algorithm ends up
+/// fitting smaller hierarchies inside larger ones where that would be
+/// beneficial.
+///
+/// For example, consider this class hierarchy:
+///
+///  A     B
+///   \   /|\
+///    \ / | \
+///     C  D  E
+///
+/// We have five bit sets: bsA (A, C), bsB (B, C, D, E), bsC (C), bsD (D) and
+/// bsE (E). If we laid out our objects by DFS traversing B followed by A, our
+/// layout would be {B, C, D, E, A}. This is optimal for bsB, as it only needs
+/// to cover the 4 objects in its hierarchy, but not for bsA, which needs to
+/// cover 5 objects, i.e. the entire layout. Our algorithm proceeds as follows:
+///
+///   Add bsC, fragments {{C}}
+///   Add bsD, fragments {{C}, {D}}
+///   Add bsE, fragments {{C}, {D}, {E}}
+///   Add bsA, fragments {{A, C}, {D}, {E}}
+///   Add bsB, fragments {{B, A, C, D, E}}
+///
+/// This layout is optimal for bsA, as it now only needs to cover two (i.e. 3
+/// fewer) objects, at the cost of bsB needing to cover 1 more object.
+///
+/// The bit set lowering pass assigns an object index to each object that needs
+/// to be laid out, and calls addFragment for each bit set, passing the object
+/// indices of its referenced globals. It then assembles the final layout from
+/// the fragments in the Fragments field.
+struct GlobalLayoutBuilder {
+  /// The computed layout. Each element of this vector contains a fragment of
+  /// layout (which may be empty) consisting of object indices.
+  std::vector<std::vector<uint64_t>> Fragments;
+
+  /// Mapping from object index to fragment index.
+  std::vector<uint64_t> FragmentMap;
+
+  GlobalLayoutBuilder(uint64_t NumObjects)
+      : Fragments(1), FragmentMap(NumObjects) {}
+
+  /// Add \param F to the layout while trying to keep its indices contiguous.
+  /// If a previously seen fragment uses any of \param F's indices, that
+  /// fragment will be laid out inside \param F.
+  void addFragment(const std::set<uint64_t> &F);
+};
+
 } // namespace llvm
 
 #endif
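(Illustrative note, not part of the patch.) The worked example in the doc comment above can be replayed directly with the new GlobalLayoutBuilder API. The sketch below is only a sanity check under stated assumptions: the object-index assignment A=0, B=1, C=2, D=3, E=4 is arbitrary, and the program is assumed to be built inside the LLVM tree and linked against the IPO library, like the unit test later in this patch.

// Sketch: replay the class-hierarchy example from the GlobalLayoutBuilder
// doc comment. The index assignment A=0, B=1, C=2, D=3, E=4 is assumed.
#include "llvm/Transforms/IPO/LowerBitSets.h"
#include <cstdio>

using namespace llvm;

int main() {
  GlobalLayoutBuilder GLB(5); // five objects: A, B, C, D, E

  // Add the bit sets smallest first, as the pass does after sorting by size.
  GLB.addFragment({2});          // bsC = {C}
  GLB.addFragment({3});          // bsD = {D}
  GLB.addFragment({4});          // bsE = {E}
  GLB.addFragment({0, 2});       // bsA = {A, C}
  GLB.addFragment({1, 2, 3, 4}); // bsB = {B, C, D, E}

  // Concatenating the (possibly empty) fragments yields object indices
  // 1, 0, 2, 3, 4, i.e. the layout {B, A, C, D, E} from the doc comment.
  const char *Names = "ABCDE";
  for (const auto &F : GLB.Fragments)
    for (uint64_t ObjIndex : F)
      std::printf("%c ", Names[ObjIndex]);
  std::printf("\n");
  return 0;
}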
Index: lib/Transforms/IPO/LowerBitSets.cpp
===================================================================
--- lib/Transforms/IPO/LowerBitSets.cpp
+++ lib/Transforms/IPO/LowerBitSets.cpp
@@ -118,6 +118,35 @@
   return BSI;
 }
 
+void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
+  // Create a new fragment to hold the layout for F.
+  Fragments.emplace_back();
+  std::vector<uint64_t> &Fragment = Fragments.back();
+  uint64_t FragmentIndex = Fragments.size() - 1;
+
+  for (auto ObjIndex : F) {
+    uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
+    if (OldFragmentIndex == 0) {
+      // We haven't seen this object index before, so just add it to the
+      // current fragment.
+      Fragment.push_back(ObjIndex);
+    } else {
+      // This index belongs to an existing fragment. Copy the elements of the
+      // old fragment into this one and clear the old fragment. We don't update
+      // the fragment map just yet; this ensures that any further references to
+      // indices from the old fragment in this fragment do not insert any more
+      // indices.
+      std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
+      Fragment.insert(Fragment.end(), OldFragment.begin(), OldFragment.end());
+      OldFragment.clear();
+    }
+  }
+
+  // Update the fragment map to point our object indices to this fragment.
+  for (uint64_t ObjIndex : Fragment)
+    FragmentMap[ObjIndex] = FragmentIndex;
+}
+
 namespace {
 
 struct LowerBitSets : public ModulePass {
@@ -485,27 +514,66 @@
     // Build the list of bitsets and referenced globals in this disjoint set.
     std::vector<MDString *> BitSets;
     std::vector<GlobalVariable *> Globals;
+    llvm::DenseMap<MDString *, uint64_t> BitSetIndices;
+    llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices;
     for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
          MI != GlobalClasses.member_end(); ++MI) {
-      if ((*MI).is<MDString *>())
+      if ((*MI).is<MDString *>()) {
+        BitSetIndices[MI->get<MDString *>()] = BitSets.size();
         BitSets.push_back(MI->get<MDString *>());
-      else
+      } else {
+        GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size();
         Globals.push_back(MI->get<GlobalVariable *>());
+      }
     }
 
+    // For each bitset, build a set of indices that refer to globals referenced
+    // by the bitset.
+    std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size());
+    if (BitSetNM) {
+      for (MDNode *Op : BitSetNM->operands()) {
+        // Op = { bitset name, global, offset }
+        if (!Op->getOperand(1))
+          continue;
+        auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0)));
+        if (I == BitSetIndices.end())
+          continue;
+
+        auto OpGlobal = cast<GlobalVariable>(
+            cast<ConstantAsMetadata>(Op->getOperand(1))->getValue());
+        BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]);
+      }
+    }
+
-    // Order bitsets and globals by name for determinism. TODO: We may later
-    // want to use a more sophisticated ordering that lays out globals so as to
-    // minimize the sizes of the bitsets.
+    // Order the sets of indices by size. The GlobalLayoutBuilder works best
+    // when given small index sets first.
+    std::stable_sort(
+        BitSetMembers.begin(), BitSetMembers.end(),
+        [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) {
+          return O1.size() < O2.size();
+        });
+
+    // Create a GlobalLayoutBuilder and provide it with index sets as layout
+    // fragments. The GlobalLayoutBuilder tries to lay out members of fragments
+    // as close together as possible.
+    GlobalLayoutBuilder GLB(Globals.size());
+    for (auto &&MemSet : BitSetMembers)
+      GLB.addFragment(MemSet);
+
+    // Build a vector of globals with the computed layout.
+    std::vector<GlobalVariable *> OrderedGlobals(Globals.size());
+    auto OGI = OrderedGlobals.begin();
+    for (auto &&F : GLB.Fragments)
+      for (auto &&Offset : F)
+        *OGI++ = Globals[Offset];
+
+    // Order bitsets by name for determinism.
     std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) {
       return S1->getString() < S2->getString();
     });
-    std::sort(Globals.begin(), Globals.end(),
-              [](GlobalVariable *GV1, GlobalVariable *GV2) {
-                return GV1->getName() < GV2->getName();
-              });
 
     // Build the bitsets from this disjoint set.
-    buildBitSetsFromGlobals(M, BitSets, Globals);
+    buildBitSetsFromGlobals(M, BitSets, OrderedGlobals);
   }
 
   return true;
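(Illustrative note, not part of the patch.) The comment in addFragment about deferring the FragmentMap update is the subtle part of the implementation: a consumed fragment is cleared, but its map entries still point at the now-empty fragment, so any later index in F that came from the same old fragment contributes nothing, and no object index can appear in the layout twice. A minimal sketch of that behaviour, under the same build assumptions as the earlier example:

// Sketch: show that a consumed (cleared) fragment cannot re-insert its
// indices, because FragmentMap is only updated after the loop over F.
#include "llvm/Transforms/IPO/LowerBitSets.h"
#include <cassert>
#include <vector>

using namespace llvm;

int main() {
  GlobalLayoutBuilder GLB(3);
  GLB.addFragment({0, 1});    // Fragments is now { {}, {0, 1} }.
  GLB.addFragment({0, 1, 2}); // Index 0 pulls in {0, 1} and clears it; index 1
                              // still maps to the old, now empty, fragment and
                              // so adds nothing; index 2 is appended as new.

  std::vector<uint64_t> Layout;
  for (const auto &F : GLB.Fragments)
    Layout.insert(Layout.end(), F.begin(), F.end());

  // No duplicates: the flattened layout is exactly 0, 1, 2.
  assert((Layout == std::vector<uint64_t>{0, 1, 2}));
  return 0;
}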
Index: test/Transforms/LowerBitSets/layout.ll
===================================================================
--- /dev/null
+++ test/Transforms/LowerBitSets/layout.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -lowerbitsets < %s | FileCheck %s
+
+target datalayout = "e-p:32:32"
+
+; Tests that this set of globals is laid out according to our layout algorithm
+; (see GlobalLayoutBuilder in include/llvm/Transforms/IPO/LowerBitSets.h).
+; The chosen layout in this case is a, e, b, d, c.
+
+; CHECK: private constant { i32, i32, i32, i32, i32 } { i32 1, i32 5, i32 2, i32 4, i32 3 }
+@a = constant i32 1
+@b = constant i32 2
+@c = constant i32 3
+@d = constant i32 4
+@e = constant i32 5
+
+!0 = !{!"bitset1", i32* @a, i32 0}
+!1 = !{!"bitset1", i32* @b, i32 0}
+!2 = !{!"bitset1", i32* @c, i32 0}
+
+!3 = !{!"bitset2", i32* @b, i32 0}
+!4 = !{!"bitset2", i32* @d, i32 0}
+
+!5 = !{!"bitset3", i32* @a, i32 0}
+!6 = !{!"bitset3", i32* @e, i32 0}
+
+!llvm.bitsets = !{ !0, !1, !2, !3, !4, !5, !6 }
+
+declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+define void @foo() {
+  %x = call i1 @llvm.bitset.test(i8* undef, metadata !"bitset1")
+  %y = call i1 @llvm.bitset.test(i8* undef, metadata !"bitset2")
+  %z = call i1 @llvm.bitset.test(i8* undef, metadata !"bitset3")
+  ret void
+}
Index: unittests/Transforms/IPO/LowerBitSets.cpp
===================================================================
--- unittests/Transforms/IPO/LowerBitSets.cpp
+++ unittests/Transforms/IPO/LowerBitSets.cpp
@@ -62,3 +62,30 @@
     }
   }
 }
+
+TEST(LowerBitSets, GlobalLayoutBuilder) {
+  struct {
+    uint64_t NumObjects;
+    std::vector<std::set<uint64_t>> Fragments;
+    std::vector<uint64_t> WantLayout;
+  } GLBTests[] = {
+    {0, {}, {}},
+    {4, {{0, 1}, {2, 3}}, {0, 1, 2, 3}},
+    {3, {{0, 1}, {1, 2}}, {0, 1, 2}},
+    {4, {{0, 1}, {1, 2}, {2, 3}}, {0, 1, 2, 3}},
+    {4, {{0, 1}, {2, 3}, {1, 2}}, {0, 1, 2, 3}},
+    {6, {{2, 5}, {0, 1, 2, 3, 4, 5}}, {0, 1, 2, 5, 3, 4}},
+  };
+
+  for (auto &&T : GLBTests) {
+    GlobalLayoutBuilder GLB(T.NumObjects);
+    for (auto &&F : T.Fragments)
+      GLB.addFragment(F);
+
+    std::vector<uint64_t> ComputedLayout;
+    for (auto &&F : GLB.Fragments)
+      ComputedLayout.insert(ComputedLayout.end(), F.begin(), F.end());
+
+    EXPECT_EQ(T.WantLayout, ComputedLayout);
+  }
+}
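(Illustrative note, not part of the patch.) The expected order a, e, b, d, c in layout.ll follows from the same machinery, assuming the globals receive indices a=0 through e=4 and that the two two-element member sets are added before bitset1's three-element set, which the stable size sort guarantees; the relative order of the two small sets does not change the result here. A sketch modelling the test case, under the same build assumptions as the earlier examples:

// Sketch: model the bit sets from layout.ll (bitset1 = {a, b, c},
// bitset2 = {b, d}, bitset3 = {a, e}) with assumed indices a=0 ... e=4.
#include "llvm/Transforms/IPO/LowerBitSets.h"
#include <cstdio>

using namespace llvm;

int main() {
  GlobalLayoutBuilder GLB(5);
  GLB.addFragment({0, 4});    // bitset3 = {a, e}
  GLB.addFragment({1, 3});    // bitset2 = {b, d}
  GLB.addFragment({0, 1, 2}); // bitset1 = {a, b, c}: a pulls in {a, e},
                              // b pulls in {b, d}, c is appended last.

  const char *Names = "abcde";
  for (const auto &F : GLB.Fragments)
    for (uint64_t ObjIndex : F)
      std::printf("%c ", Names[ObjIndex]); // prints: a e b d c
  std::printf("\n");
  return 0;
}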