diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -1996,12 +1996,15 @@ SmallVectorImpl &NewArgv, bool MarkEOLs = false); -/// Tokenizes a Windows command line which may contain quotes and escaped -/// quotes. +/// Tokenizes a string of Windows command line arguments, which may contain +/// quotes and escaped quotes. /// /// See MSDN docs for CommandLineToArgvW for information on the quoting rules. /// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx /// +/// For handling a full Windows command line including the executable name at +/// the start, see TokenizeWindowsCommandLineFull below. +/// /// \param [in] Source The string to be split on whitespace with quotes. /// \param [in] Saver Delegates back to the caller for saving parsed strings. /// \param [in] MarkEOLs true if tokenizing a response file and you want end of @@ -2018,6 +2021,23 @@ void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver, SmallVectorImpl &NewArgv); +/// Tokenizes a Windows full command line, including command name at the start. +/// +/// This uses the same syntax rules as TokenizeWindowsCommandLine for all but +/// the first token. But the first token is expected to be parsed as the +/// executable file name in the way CreateProcess would do it, rather than the +/// way the C library startup code would do it: CreateProcess does not consider +/// that \ is ever an escape character (because " is not a valid filename char, +/// hence there's never a need to escape it to be used literally). +/// +/// Parameters are the same as for TokenizeWindowsCommandLine. In particular, +/// if you set MarkEOLs = true, then the first word of every line will be +/// parsed using the special rules for command names, making this function +/// suitable for parsing a file full of commands to execute. +void TokenizeWindowsCommandLineFull(StringRef Source, StringSaver &Saver, + SmallVectorImpl &NewArgv, + bool MarkEOLs = false); + /// String tokenization function type. Should be compatible with either /// Windows or Unix command line tokenizers. using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver, diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -918,21 +918,34 @@ return I - 1; } -// Windows treats whitespace, double quotes, and backslashes specially. +// Windows treats whitespace, double quotes, and backslashes specially, except +// when parsing the first token of a full command line, in which case +// backslashes are not special. static bool isWindowsSpecialChar(char C) { return isWhitespaceOrNull(C) || C == '\\' || C == '\"'; } +static bool isWindowsSpecialCharInCommandName(char C) { + return isWhitespaceOrNull(C) || C == '\"'; +} // Windows tokenization implementation. The implementation is designed to be // inlined and specialized for the two user entry points. -static inline void -tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, - function_ref AddToken, - bool AlwaysCopy, function_ref MarkEOL) { +static inline void tokenizeWindowsCommandLineImpl( + StringRef Src, StringSaver &Saver, function_ref AddToken, + bool AlwaysCopy, function_ref MarkEOL, bool InitialCommandName) { SmallString<128> Token; + // Sometimes, this function will be handling a full command line including an + // executable pathname at the start. In that situation, the initial pathname + // needs different handling from the following arguments, because when + // CreateProcess or cmd.exe scans the pathname, it doesn't treat \ as + // escaping the quote character, whereas when libc scans the rest of the + // command line, it does. + bool CommandName = InitialCommandName; + // Try to do as much work inside the state machine as possible. enum { INIT, UNQUOTED, QUOTED } State = INIT; + for (size_t I = 0, E = Src.size(); I < E; ++I) { switch (State) { case INIT: { @@ -947,19 +960,29 @@ if (I >= E) break; size_t Start = I; - while (I < E && !isWindowsSpecialChar(Src[I])) - ++I; + if (CommandName) { + while (I < E && !isWindowsSpecialCharInCommandName(Src[I])) + ++I; + } else { + while (I < E && !isWindowsSpecialChar(Src[I])) + ++I; + } StringRef NormalChars = Src.slice(Start, I); if (I >= E || isWhitespaceOrNull(Src[I])) { // No special characters: slice out the substring and start the next // token. Copy the string if the caller asks us to. AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars); - if (I < E && Src[I] == '\n') + if (I < E && Src[I] == '\n') { MarkEOL(); + CommandName = InitialCommandName; + } else { + CommandName = false; + } } else if (Src[I] == '\"') { Token += NormalChars; State = QUOTED; } else if (Src[I] == '\\') { + assert(!CommandName && "or else we'd have treated it as a normal char"); Token += NormalChars; I = parseBackslash(Src, I, Token); State = UNQUOTED; @@ -976,12 +999,16 @@ // token. AddToken(Saver.save(Token.str())); Token.clear(); - if (Src[I] == '\n') + if (Src[I] == '\n') { + CommandName = InitialCommandName; MarkEOL(); + } else { + CommandName = false; + } State = INIT; } else if (Src[I] == '\"') { State = QUOTED; - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -999,7 +1026,7 @@ // Otherwise, end the quoted portion and return to the unquoted state. State = UNQUOTED; } - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -1021,7 +1048,7 @@ NewArgv.push_back(nullptr); }; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, - /*AlwaysCopy=*/true, OnEOL); + /*AlwaysCopy=*/true, OnEOL, false); } void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, @@ -1029,7 +1056,19 @@ auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); }; auto OnEOL = []() {}; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false, - OnEOL); + OnEOL, false); +} + +void cl::TokenizeWindowsCommandLineFull(StringRef Src, StringSaver &Saver, + SmallVectorImpl &NewArgv, + bool MarkEOLs) { + auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); }; + auto OnEOL = [&]() { + if (MarkEOLs) + NewArgv.push_back(nullptr); + }; + tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, + /*AlwaysCopy=*/true, OnEOL, true); } void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver, diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc --- a/llvm/lib/Support/Windows/Process.inc +++ b/llvm/lib/Support/Windows/Process.inc @@ -247,7 +247,7 @@ SmallVector TmpArgs; StringSaver Saver(Alloc); - cl::TokenizeWindowsCommandLine(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); + cl::TokenizeWindowsCommandLineFull(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); for (const char *Arg : TmpArgs) { EC = WildcardExpand(Arg, Args, Saver); diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp --- a/llvm/unittests/Support/CommandLineTest.cpp +++ b/llvm/unittests/Support/CommandLineTest.cpp @@ -257,6 +257,24 @@ testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input3, Output3); } +TEST(CommandLineTest, TokenizeWindowsCommandLineExeName) { + const char Input1[] = + R"("C:\Program Files\Whatever\"clang.exe z.c -DY=\"x\")"; + const char *const Output1[] = {"C:\\Program Files\\Whatever\\clang.exe", + "z.c", "-DY=\"x\""}; + testCommandLineTokenizer(cl::TokenizeWindowsCommandLineFull, Input1, Output1); + + const char Input2[] = "\"a\\\"b c\\\"d\n\"e\\\"f g\\\"h\n"; + const char *const Output2[] = {"a\\b", "c\"d", nullptr, + "e\\f", "g\"h", nullptr}; + testCommandLineTokenizer(cl::TokenizeWindowsCommandLineFull, Input2, Output2, + /*MarkEOLs=*/true); + + const char Input3[] = R"(\\server\share\subdir\clang.exe)"; + const char *const Output3[] = {"\\\\server\\share\\subdir\\clang.exe"}; + testCommandLineTokenizer(cl::TokenizeWindowsCommandLineFull, Input3, Output3); +} + TEST(CommandLineTest, TokenizeAndMarkEOLs) { // Clang uses EOL marking in response files to support options that consume // the rest of the arguments on the current line, but do not consume arguments