Index: lld/trunk/ELF/ICF.cpp =================================================================== --- lld/trunk/ELF/ICF.cpp +++ lld/trunk/ELF/ICF.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// ICF is short for Identical Code Folding. That is a size optimization to +// ICF is short for Identical Code Folding. This is a size optimization to // identify and merge two or more read-only sections (typically functions) // that happened to have the same contents. It usually reduces output size // by a few percent. @@ -26,43 +26,50 @@ // void bar() { foo(); } // // If you merge the two, their relocations point to the same section and -// thus you know they are mergeable, but how do we know they are mergeable -// in the first place? This is not an easy problem to solve. +// thus you know they are mergeable, but how do you know they are +// mergeable in the first place? This is not an easy problem to solve. // -// What we are doing in LLD is some sort of coloring algorithm. +// What we are doing in LLD is to partition sections into equivalence +// classes. Sections in the same equivalence class when the algorithm +// terminates are considered identical. Here are details: +// +// 1. First, we partition sections using their hash values as keys. Hash +// values contain section types, section contents and numbers of +// relocations. During this step, relocation targets are not taken into +// account. We just put sections that apparently differ into different +// equivalence classes. +// +// 2. Next, for each equivalence class, we visit sections to compare +// relocation targets. Relocation targets are considered equivalent if +// their targets are in the same equivalence class. Sections with +// different relocation targets are put into different equivalence +// clases. +// +// 3. If we split an equivalence class in step 2, two relocations +// previously target the same equivalence class may now target +// different equivalence classes. Therefore, we repeat step 2 until a +// convergence is obtained. // -// We color non-identical sections in different colors repeatedly. -// Sections in the same color when the algorithm terminates are considered -// identical. Here are the details: -// -// 1. First, we color all sections using their hash values of section -// types, section contents, and numbers of relocations. At this moment, -// relocation targets are not taken into account. We just color -// sections that apparently differ in different colors. -// -// 2. Next, for each color C, we visit sections in color C to compare -// relocation target colors. We recolor sections A and B in different -// colors if A's and B's relocations are different in terms of target -// colors. -// -// 3. If we recolor some section in step 2, relocations that were -// previously pointing to the same color targets may now be pointing to -// different colors. Therefore, repeat 2 until a convergence is -// obtained. -// -// 4. For each color C, pick an arbitrary section in color C, and merges -// other sections in color C with it. +// 4. For each equivalence class C, pick an arbitrary section in C, and +// merge all the other sections in C with it. // // For small programs, this algorithm needs 3-5 iterations. For large // programs such as Chromium, it takes more than 20 iterations. // +// This algorithm was mentioned as an "optimistic algorithm" in [1], +// though gold implements a different algorithm than this. +// // We parallelize each step so that multiple threads can work on different -// colors concurrently. That gave us a large performance boost when -// applying ICF on large programs. For example, MSVC link.exe or GNU gold -// takes 10-20 seconds to apply ICF on Chromium, whose output size is -// about 1.5 GB, but LLD can finish it in less than 2 seconds on a 2.8 GHz -// 40 core machine. Even without threading, LLD's ICF is still faster than -// MSVC or gold though. +// equivalence classes concurrently. That gave us a large performance +// boost when applying ICF on large programs. For example, MSVC link.exe +// or GNU gold takes 10-20 seconds to apply ICF on Chromium, whose output +// size is about 1.5 GB, but LLD can finish it in less than 2 seconds on a +// 2.8 GHz 40 core machine. Even without threading, LLD's ICF is still +// faster than MSVC or gold though. +// +// [1] Safe ICF: Pointer Safe and Unwinding aware Identical Code Folding +// in the Gold Linker +// http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36912.pdf // //===----------------------------------------------------------------------===// @@ -103,10 +110,10 @@ size_t findBoundary(size_t Begin, size_t End); - void forEachColorRange(size_t Begin, size_t End, + void forEachClassRange(size_t Begin, size_t End, std::function Fn); - void forEachColor(std::function Fn); + void forEachClass(std::function Fn); std::vector *> Sections; @@ -116,26 +123,28 @@ // The main loop counter. int Cnt = 0; - // We have two locations for colors. On the first iteration of the main - // loop, Color[0] has a valid value, and Color[1] contains garbage. We - // read colors from slot 0 and write to slot 1. So, Color[0] represents - // the current color, and Color[1] represents the next color. On each - // iteration, they switch the roles, so we use them alternately. + // We have two locations for equivalence classes. On the first iteration + // of the main loop, Class[0] has a valid value, and Class[1] contains + // garbage. We read equivalence classes from slot 0 and write to slot 1. + // So, Class[0] represents the current class, and Class[1] represents + // the next class. On each iteration, we switch their roles and use them + // alternately. // // Why are we doing this? Recall that other threads may be working on - // other colors in parallel. They may read colors that we are updating. - // We cannot update colors in place because it breaks the invariance - // that all possibly-identical sections must have the same color at any - // moment. In other words, the for loop to update colors is not an - // atomic operation, and that is observable from other threads. By - // writing new colors to write-only places, we can keep the invariance. + // other equivalence classes in parallel. They may read sections that we + // are updating. We cannot update equivalence classes in place because + // it breaks the invariance that all possibly-identical sections must be + // in the same equivalence class at any moment. In other words, the for + // loop to update equivalence classes is not atomic, and that is + // observable from other threads. By writing new classes to other + // places, we can keep the invariance. // - // Below, `Current` has the index of the current color, and `Next` has - // the index of the next color. If threading is enabled, they are - // either (0, 1) or (1, 0). + // Below, `Current` has the index of the current class, and `Next` has + // the index of the next class. If threading is enabled, they are either + // (0, 1) or (1, 0). // // Note on single-thread: if that's the case, they are always (0, 0) - // because we can safely read next colors without worrying about race + // because we can safely read the next class without worrying about race // conditions. Using the same location makes this algorithm converge // faster because it uses results of the same iteration earlier. int Current = 0; @@ -158,8 +167,7 @@ S->Name != ".init" && S->Name != ".fini"; } -// Split a range into smaller ranges by recoloring sections -// in a given range. +// Split an equivalence class into smaller classes. template void ICF::segregate(size_t Begin, size_t End, bool Constant) { // This loop rearranges sections in [Begin, End) so that all sections @@ -183,10 +191,10 @@ size_t Mid = Bound - Sections.begin(); // Now we split [Begin, End) into [Begin, Mid) and [Mid, End) by - // updating the sections in [Begin, End). We use Mid as a color ID - // because every group ends with a unique index. + // updating the sections in [Begin, End). We use Mid as an equivalence + // class ID because every group ends with a unique index. for (size_t I = Begin; I < Mid; ++I) - Sections[I]->Color[Next] = Mid; + Sections[I]->Class[Next] = Mid; // If we created a group, we need to iterate the main loop again. if (Mid != End) @@ -237,7 +245,7 @@ if (&SA == &SB) return true; - // Or, the two sections must have the same color. + // Or, the two sections must be in the same equivalence class. auto *DA = dyn_cast>(&SA); auto *DB = dyn_cast>(&SB); if (!DA || !DB) @@ -250,12 +258,12 @@ if (!X || !Y) return false; - // Ineligible sections have the special color 0. - // They can never be the same in terms of section colors. - if (X->Color[Current] == 0) + // Ineligible sections are in the special equivalence class 0. + // They can never be the same in terms of the equivalence class. + if (X->Class[Current] == 0) return false; - return X->Color[Current] == Y->Color[Current]; + return X->Class[Current] == Y->Class[Current]; }; return std::equal(RelsA.begin(), RelsA.end(), RelsB.begin(), Eq); @@ -271,21 +279,22 @@ } template size_t ICF::findBoundary(size_t Begin, size_t End) { + uint32_t Class = Sections[Begin]->Class[Current]; for (size_t I = Begin + 1; I < End; ++I) - if (Sections[Begin]->Color[Current] != Sections[I]->Color[Current]) + if (Class != Sections[I]->Class[Current]) return I; return End; } -// Sections in the same color are contiguous in Sections vector. -// Therefore, Sections vector can be considered as contiguous groups -// of sections, grouped by colors. +// Sections in the same equivalence class are contiguous in Sections +// vector. Therefore, Sections vector can be considered as contiguous +// groups of sections, grouped by the class. // // This function calls Fn on every group that starts within [Begin, End). // Note that a group must starts in that range but doesn't necessarily // have to end before End. template -void ICF::forEachColorRange(size_t Begin, size_t End, +void ICF::forEachClassRange(size_t Begin, size_t End, std::function Fn) { if (Begin > 0) Begin = findBoundary(Begin - 1, End); @@ -297,13 +306,13 @@ } } -// Call Fn on each color group. +// Call Fn on each equivalence class. template -void ICF::forEachColor(std::function Fn) { +void ICF::forEachClass(std::function Fn) { // If threading is disabled or the number of sections are // too small to use threading, call Fn sequentially. if (!Config->Threads || Sections.size() < 1024) { - forEachColorRange(0, Sections.size(), Fn); + forEachClassRange(0, Sections.size(), Fn); ++Cnt; return; } @@ -315,8 +324,8 @@ size_t NumShards = 256; size_t Step = Sections.size() / NumShards; forLoop(0, NumShards, - [&](size_t I) { forEachColorRange(I * Step, (I + 1) * Step, Fn); }); - forEachColorRange(Step * NumShards, Sections.size(), Fn); + [&](size_t I) { forEachClassRange(I * Step, (I + 1) * Step, Fn); }); + forEachClassRange(Step * NumShards, Sections.size(), Fn); ++Cnt; } @@ -328,34 +337,32 @@ if (isEligible(S)) Sections.push_back(S); - // Initially, we use hash values to color sections. Therefore, if - // two sections have the same color, they are likely (but not - // guaranteed) to have the same static contents in terms of ICF. + // Initially, we use hash values to partition sections. for (InputSection *S : Sections) - // Set MSB to 1 to avoid collisions with non-hash colors. - S->Color[0] = getHash(S) | (1 << 31); + // Set MSB to 1 to avoid collisions with non-hash IDs. + S->Class[0] = getHash(S) | (1 << 31); - // From now on, sections in Sections are ordered so that sections in - // the same color are consecutive in the vector. + // From now on, sections in Sections vector are ordered so that sections + // in the same equivalence class are consecutive in the vector. std::stable_sort(Sections.begin(), Sections.end(), [](InputSection *A, InputSection *B) { - return A->Color[0] < B->Color[0]; + return A->Class[0] < B->Class[0]; }); // Compare static contents and assign unique IDs for each static content. - forEachColor([&](size_t Begin, size_t End) { segregate(Begin, End, true); }); + forEachClass([&](size_t Begin, size_t End) { segregate(Begin, End, true); }); // Split groups by comparing relocations until convergence is obtained. do { Repeat = false; - forEachColor( + forEachClass( [&](size_t Begin, size_t End) { segregate(Begin, End, false); }); } while (Repeat); log("ICF needed " + Twine(Cnt) + " iterations"); - // Merge sections in the same colors. - forEachColor([&](size_t Begin, size_t End) { + // Merge sections by the equivalence class. + forEachClass([&](size_t Begin, size_t End) { if (End - Begin == 1) return; Index: lld/trunk/ELF/InputSection.h =================================================================== --- lld/trunk/ELF/InputSection.h +++ lld/trunk/ELF/InputSection.h @@ -289,7 +289,7 @@ void relocateNonAlloc(uint8_t *Buf, llvm::ArrayRef Rels); // Used by ICF. - uint32_t Color[2] = {0, 0}; + uint32_t Class[2] = {0, 0}; // Called by ICF to merge two input sections. void replace(InputSection *Other);