diff --git a/libcxxabi/src/demangle/Utility.h b/libcxxabi/src/demangle/Utility.h --- a/libcxxabi/src/demangle/Utility.h +++ b/libcxxabi/src/demangle/Utility.h @@ -126,6 +126,16 @@ return this->operator<<(static_cast(N)); } + void insert(size_t Pos, const char *S, size_t N) { + assert(Pos <= CurrentPosition); + if (N == 0) + return; + grow(N); + std::memmove(Buffer + Pos + N, Buffer + Pos, CurrentPosition - Pos); + std::memcpy(Buffer + Pos, S, N); + CurrentPosition += N; + } + size_t getCurrentPosition() const { return CurrentPosition; } void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; } diff --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h --- a/llvm/include/llvm/Demangle/Utility.h +++ b/llvm/include/llvm/Demangle/Utility.h @@ -126,6 +126,16 @@ return this->operator<<(static_cast(N)); } + void insert(size_t Pos, const char *S, size_t N) { + assert(Pos <= CurrentPosition); + if (N == 0) + return; + grow(N); + std::memmove(Buffer + Pos + N, Buffer + Pos, CurrentPosition - Pos); + std::memcpy(Buffer + Pos, S, N); + CurrentPosition += N; + } + size_t getCurrentPosition() const { return CurrentPosition; } void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; } diff --git a/llvm/lib/Demangle/RustDemangle.cpp b/llvm/lib/Demangle/RustDemangle.cpp --- a/llvm/lib/Demangle/RustDemangle.cpp +++ b/llvm/lib/Demangle/RustDemangle.cpp @@ -135,6 +135,7 @@ void printDecimalNumber(uint64_t N); void printBasicType(BasicType); void printLifetime(uint64_t Index); + void printIdentifier(Identifier Ident); char look() const; char consume(); @@ -283,8 +284,7 @@ switch (consume()) { case 'C': { parseOptionalBase62Number('s'); - Identifier Ident = parseIdentifier(); - print(Ident.Name); + printIdentifier(parseIdentifier()); break; } case 'M': { @@ -333,7 +333,7 @@ print(NS); if (!Ident.empty()) { print(":"); - print(Ident.Name); + printIdentifier(Ident); } print('#'); printDecimalNumber(Disambiguator); @@ -342,7 +342,7 @@ // Implementation internal namespaces. if (!Ident.empty()) { print("::"); - print(Ident.Name); + printIdentifier(Ident); } } break; @@ -669,6 +669,8 @@ print("C"); } else { Identifier Ident = parseIdentifier(); + if (Ident.Punycode) + Error = true; for (char C : Ident.Name) { // When mangling ABI string, the "-" is replaced with "_". if (C == '_') @@ -1078,6 +1080,172 @@ } } +static inline bool decodePunycodeDigit(char C, size_t &Value) { + if (isLower(C)) { + Value = C - 'a'; + return true; + } + + if (isDigit(C)) { + Value = 26 + (C - '0'); + return true; + } + + return false; +} + +static void removeNullBytes(OutputStream &Output, size_t StartIdx) { + char *Buffer = Output.getBuffer(); + char *Start = Buffer + StartIdx; + char *End = Buffer + Output.getCurrentPosition(); + Output.setCurrentPosition(std::remove(Start, End, '\0') - Buffer); +} + +// Encodes code point as UTF-8 and stores results in Output. Returns false if +// CodePoint is not a valid unicode scalar value. +static inline bool encodeUTF8(size_t CodePoint, char *Output) { + if (0xD800 <= CodePoint && CodePoint <= 0xDFFF) + return false; + + if (CodePoint <= 0x7F) { + Output[0] = CodePoint; + return true; + } + + if (CodePoint <= 0x7FF) { + Output[0] = 0xC0 | ((CodePoint >> 6) & 0x3F); + Output[1] = 0x80 | (CodePoint & 0x3F); + return true; + } + + if (CodePoint <= 0xFFFF) { + Output[0] = 0xE0 | (CodePoint >> 12); + Output[1] = 0x80 | ((CodePoint >> 6) & 0x3F); + Output[2] = 0x80 | (CodePoint & 0x3F); + return true; + } + + if (CodePoint <= 0x10FFFF) { + Output[0] = 0xF0 | (CodePoint >> 18); + Output[1] = 0x80 | ((CodePoint >> 12) & 0x3F); + Output[2] = 0x80 | ((CodePoint >> 6) & 0x3F); + Output[3] = 0x80 | (CodePoint & 0x3F); + return true; + } + + return false; +} + +// Decodes string encoded using punycode and appends results to Output. +// Returns true if decoding was successful. +static bool decodePunycode(StringView Input, OutputStream &Output) { + size_t OutputSize = Output.getCurrentPosition(); + size_t InputIdx = 0; + + // Rust uses an underscore as a delimiter. + size_t DelimiterPos = StringView::npos; + for (size_t I = 0; I != Input.size(); ++I) + if (Input[I] == '_') + DelimiterPos = I; + + if (DelimiterPos != StringView::npos) { + // Copy basic code points before the last delimiter to the output. + for (; InputIdx != DelimiterPos; ++InputIdx) { + char C = Input[InputIdx]; + if (!isValid(C)) + return false; + // Code points are padded with zeros while decoding is in progress. + char UTF8[4] = {C}; + Output += StringView(UTF8, UTF8 + 4); + } + // Skip over the delimiter. + ++InputIdx; + } + + size_t Base = 36; + size_t Skew = 38; + size_t Bias = 72; + size_t N = 0x80; + size_t TMin = 1; + size_t TMax = 26; + size_t Damp = 700; + + auto Adapt = [&](size_t Delta, size_t NumPoints) { + Delta /= Damp; + Delta += Delta / NumPoints; + Damp = 2; + + size_t K = 0; + while (Delta > (Base - TMin) * TMax / 2) { + Delta /= Base - TMin; + K += Base; + } + return K + (((Base - TMin + 1) * Delta) / (Delta + Skew)); + }; + + // Main decoding loop. + for (size_t I = 0; InputIdx != Input.size(); I += 1) { + size_t OldI = I; + size_t W = 1; + size_t Max = std::numeric_limits::max(); + for (size_t K = Base; true; K += Base) { + if (InputIdx == Input.size()) + return false; + char C = Input[InputIdx++]; + size_t Digit = 0; + if (!decodePunycodeDigit(C, Digit)) + return false; + + if (Digit > (Max - I) / W) + return false; + I += Digit * W; + + size_t T; + if (K <= Bias) + T = TMin; + else if (K >= Bias + TMax) + T = TMax; + else + T = K - Bias; + + if (Digit < T) + break; + + if (W > Max / (Base - T)) + return false; + W *= (Base - T); + } + size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1; + Bias = Adapt(I - OldI, NumPoints); + + if (I / NumPoints > Max - N) + return false; + N += I / NumPoints; + I = I % NumPoints; + + // Insert N at position I in the output. + char UTF8[4] = {}; + if (!encodeUTF8(N, UTF8)) + return false; + Output.insert(OutputSize + I * 4, UTF8, 4); + } + + removeNullBytes(Output, OutputSize); + return true; +} + +void Demangler::printIdentifier(Identifier Ident) { + if (Error || !Print) + return; + + if (Ident.Punycode) { + if (!decodePunycode(Ident.Name, Output)) + Error = true; + } else { + print(Ident.Name); + } +} + char Demangler::look() const { if (Error || Position >= Input.size()) return 0; diff --git a/llvm/test/Demangle/rust.test b/llvm/test/Demangle/rust.test --- a/llvm/test/Demangle/rust.test +++ b/llvm/test/Demangle/rust.test @@ -237,6 +237,11 @@ CHECK: function:: _RIC8functionFUK21C_cmse_nonsecure_callEuE +; Invalid ABI with punycode. + +CHECK: _RIC8functionFKu3n3hEuE + _RIC8functionFKu3n3hEuE + ; Trait objects CHECK: trait:: @@ -456,6 +461,44 @@ CHECK: dot (.llvm.6789) _RC3dotC5crate.llvm.6789 +; Punycode + +CHECK: punycode::東京 + _RNvC8punycodeu7_1lqs71d + +CHECK: punycode::zażółć_gęślą_jaźń + _RNvC8punycodeu29za_gl_ja_w3a7psa2tqtgb10airva + +CHECK: punycode::საჭმელად_გემრიელი_სადილი + _RNvC8punycodeu30____7hkackfecea1cbdathfdh9hlq6y + +CHECK: Gödel::Escher::Bach + _RNtNvCu8Gdel_5qa6Escher4Bach + +CHECK: punycode::🦁🐅 + _RNvC8punycodeu7wn8hx1g + +; Punycode - invalid code point + +CHECK: _RCu5r731r + _RCu5r731r + +CHECK: _RCu8b44444yy + _RCu8b44444yy + +CHECK: _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz + _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz + +; Punycode - early EOF + +CHECK: _RCu8_CCCAR_u4 + _RCu8_CCCAR_u4 + +; Punycode - overflow + +CHECK: _RNvC1au21p18888888888888888888 + _RNvC1au21p18888888888888888888 + ; Invalid mangled characters CHECK: _RNvC2a.1c