diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp
--- a/lld/COFF/DriverUtils.cpp
+++ b/lld/COFF/DriverUtils.cpp
@@ -861,19 +861,27 @@
 }
 
 // Tokenizes and parses a given string as command line in .drective section.
-// /EXPORT options are processed in fastpath.
 ParsedDirectives ArgParser::parseDirectives(StringRef s) {
   ParsedDirectives result;
   SmallVector<const char *, 16> rest;
 
-  for (StringRef tok : tokenize(s)) {
+  // Handle /EXPORT and /INCLUDE in a fast path. These directives can appear for
+  // potentially every symbol in the object, so they must be handled quickly.
+  SmallVector<StringRef, 16> tokens;
+  cl::TokenizeWindowsCommandLineNoCopy(s, saver, tokens);
+  for (StringRef tok : tokens) {
     if (tok.startswith_lower("/export:") || tok.startswith_lower("-export:"))
       result.exports.push_back(tok.substr(strlen("/export:")));
     else if (tok.startswith_lower("/include:") ||
              tok.startswith_lower("-include:"))
       result.includes.push_back(tok.substr(strlen("/include:")));
-    else
-      rest.push_back(tok.data());
+    else {
+      // Copy substrings that are not valid C strings. Only inspect the byte
+      // after the token when it lies inside `s`; a token that ends where `s`
+      // ends has no terminator within the buffer and must be copied.
+      bool HasNul = tok.end() != s.end() && tok.data()[tok.size()] == '\0';
+      rest.push_back(HasNul ? tok.data() : saver.save(tok).data());
+    }
   }
 
   // Make InputArgList from unparsed string vectors.
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -2027,6 +2027,13 @@
                                 SmallVectorImpl<const char *> &NewArgv,
                                 bool MarkEOLs = false);
 
+/// Tokenizes a Windows command line while attempting to avoid copies. If no
+/// quoting or escaping was used, this produces substrings of the original
+/// string. If a token requires unquoting, it will be allocated with the
+/// StringSaver.
+void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver,
+                                      SmallVectorImpl<StringRef> &NewArgv);
+
 /// String tokenization function type. Should be compatible with either
 /// Windows or Unix command line tokenizers.
 using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver,
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -919,91 +919,122 @@
   return I - 1;
 }
 
-void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
-                                    SmallVectorImpl<const char *> &NewArgv,
-                                    bool MarkEOLs) {
+// Windows treats whitespace, double quotes, and backslashes specially.
+static bool isWindowsSpecialChar(char C) {
+  return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
+}
+
+// Windows tokenization implementation. The implementation is designed to be
+// inlined and specialized for the two user entry points.
+// AddToken receives each finished token, MarkEOL is invoked for every '\n'
+// outside of a quoted region, and AlwaysCopy forces tokens that need no
+// unquoting to be saved with Saver instead of being passed as slices of Src.
+static inline void
+tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
+                               function_ref<void(StringRef)> AddToken,
+                               bool AlwaysCopy, function_ref<void()> MarkEOL) {
   SmallString<128> Token;
 
-  // This is a small state machine to consume characters until it reaches the
-  // end of the source string.
+  // Try to do as much work inside the state machine as possible.
   enum { INIT, UNQUOTED, QUOTED } State = INIT;
-  for (size_t I = 0, E = Src.size(); I != E; ++I) {
-    char C = Src[I];
-
-    // INIT state indicates that the current input index is at the start of
-    // the string or between tokens.
-    if (State == INIT) {
-      if (isWhitespaceOrNull(C)) {
-        // Mark the end of lines in response files
-        if (MarkEOLs && C == '\n')
-          NewArgv.push_back(nullptr);
-        continue;
+  for (size_t I = 0, E = Src.size(); I < E; ++I) {
+    switch (State) {
+    case INIT: {
+      assert(Token.empty() && "token should be empty in initial state");
+      // Eat whitespace before a token.
+      while (I < E && isWhitespaceOrNull(Src[I])) {
+        if (Src[I] == '\n')
+          MarkEOL();
+        ++I;
       }
-      if (C == '"') {
+      // Stop if this was trailing whitespace.
+      if (I >= E)
+        break;
+      size_t Start = I;
+      while (I < E && !isWindowsSpecialChar(Src[I]))
+        ++I;
+      StringRef NormalChars = Src.slice(Start, I);
+      if (I >= E || isWhitespaceOrNull(Src[I])) {
+        // No special characters: slice out the substring and start the next
+        // token. Copy the string if the caller asks us to.
+        AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
+        // Report the end of line only after the token that precedes it.
+        if (I < E && Src[I] == '\n')
+          MarkEOL();
+      } else if (Src[I] == '\"') {
+        Token += NormalChars;
         State = QUOTED;
-        continue;
-      }
-      if (C == '\\') {
+      } else if (Src[I] == '\\') {
+        Token += NormalChars;
         I = parseBackslash(Src, I, Token);
         State = UNQUOTED;
-        continue;
+      } else {
+        llvm_unreachable("unexpected special character");
       }
-      Token.push_back(C);
-      State = UNQUOTED;
-      continue;
+      break;
     }
 
-    // UNQUOTED state means that it's reading a token not quoted by double
-    // quotes.
-    if (State == UNQUOTED) {
-      // Whitespace means the end of the token.
-      if (isWhitespaceOrNull(C)) {
-        NewArgv.push_back(Saver.save(StringRef(Token)).data());
+    case UNQUOTED:
+      if (isWhitespaceOrNull(Src[I])) {
+        // Whitespace means the end of the token. If we are in this state, the
+        // token must have contained a special character, so we must copy the
+        // token.
+        AddToken(Saver.save(Token.str()));
         Token.clear();
+        if (Src[I] == '\n')
+          MarkEOL();
         State = INIT;
-        // Mark the end of lines in response files
-        if (MarkEOLs && C == '\n')
-          NewArgv.push_back(nullptr);
-        continue;
-      }
-      if (C == '"') {
+      } else if (Src[I] == '\"') {
         State = QUOTED;
-        continue;
-      }
-      if (C == '\\') {
+      } else if (Src[I] == '\\') {
         I = parseBackslash(Src, I, Token);
-        continue;
+      } else {
+        Token.push_back(Src[I]);
       }
-      Token.push_back(C);
-      continue;
-    }
+      break;
 
-    // QUOTED state means that it's reading a token quoted by double quotes.
-    if (State == QUOTED) {
-      if (C == '"') {
+    case QUOTED:
+      if (Src[I] == '\"') {
         if (I < (E - 1) && Src[I + 1] == '"') {
           // Consecutive double-quotes inside a quoted string implies one
           // double-quote.
           Token.push_back('"');
-          I = I + 1;
-          continue;
+          ++I;
+        } else {
+          // Otherwise, end the quoted portion and return to the unquoted state.
+          State = UNQUOTED;
         }
-        State = UNQUOTED;
-        continue;
-      }
-      if (C == '\\') {
+      } else if (Src[I] == '\\') {
         I = parseBackslash(Src, I, Token);
-        continue;
+      } else {
+        Token.push_back(Src[I]);
       }
-      Token.push_back(C);
+      break;
     }
   }
-  // Append the last token after hitting EOF with no whitespace.
+
   if (!Token.empty())
-    NewArgv.push_back(Saver.save(StringRef(Token)).data());
-  // Mark the end of response files
-  if (MarkEOLs)
-    NewArgv.push_back(nullptr);
+    AddToken(Saver.save(Token.str()));
+}
+
+void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
+                                    SmallVectorImpl<const char *> &NewArgv,
+                                    bool MarkEOLs) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
+  auto OnEOL = [&]() {
+    if (MarkEOLs)
+      NewArgv.push_back(nullptr);
+  };
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
+                                 /*AlwaysCopy=*/true, OnEOL);
+}
+
+void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
+                                          SmallVectorImpl<StringRef> &NewArgv) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
+  auto OnEOL = []() {};
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
+                                 OnEOL);
 }
 
 void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,