This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/
-
lib/Lex/
-
Lex/
-
DependencyDirectivesSourceMinimizer.cpp
-
unittests/Lex/
-
Lex/
1
DependencyDirectivesSourceMinimizerTest.cpp

Differential D104459

[clang][lex] Ensure minimizer output is never larger than input
AbandonedPublic

Authored by jansvoboda11 on Jun 17 2021, 6:34 AM.

Download Raw Diff

Details

Reviewers

Bigcheese
dexonsmith
arphaman

Summary

This patch ensures the output of source minimizer is never larger than the input. This property will be handy in a follow up patch that makes it possible to pad the minimized output to the size of the original input.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jansvoboda11 requested review of this revision.Jun 17 2021, 6:34 AM

jansvoboda11 created this revision.

Herald added a project: Restricted Project. · View Herald TranscriptJun 17 2021, 6:34 AM

Herald added a subscriber: cfe-commits. · View Herald Transcript

jansvoboda11 added a child revision: D104460: [clang][lex] NFC: Extract source variable in minimizer tests.Jun 17 2021, 6:37 AM

jansvoboda11 added a child revision: D104462: [clang][lex] Add minimizer option to pad the output to the input size.Jun 17 2021, 7:25 AM

Fix assert wording

This patch ensures the output of source minimizer is never larger than the input. This property will be handy in a follow up patch that makes it possible to pad the minimized output to the size of the original input.

I suppose when I first wrote this I was thinking of a secondary purpose of canonicalizing the sources; in which case, adding a trailing newline (for example) seems like a feature rather than a bug. I'm not too attached to that, but I am curious for more context about why it's important to be able to pad the file to the same size, and whether that indicates a bug that should be fixed somewhere else instead. (I'll read the follow-up patches in the stack...)

clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp
149	I'm not comfortable with this change... the idea of the `(/* invalid */` was to guarantee that an invalid macro argument list didn't get minimized to something valid. Perhaps the result could be `"#define MACRO(\n"`, and just strip the comment?

Another thing to consider: a client that wants the minimized source to be "no bigger than" the original source can use the original source if it ends up growing. For example, https://reviews.llvm.org/D104462 could check the resulting size, and just return the original source in the (extremely unlikely?) corner case where the minimized sources are bigger.

In that case, we wouldn't need this patch's assertion.

I bet some of these changes are good to do anyway (like dropping the /*invalid*/ comment, which was useful as a canary-in-the-coal-mine when testing the prototype on large bodies of sources (and maybe has outlived its usefulness)), but you could skip the fiddly changes here that complicate the logic and make the output less canonical and harder to read, like "maintain 0 whitespace after macro names" and "maintain missing newline at EOF".

Harbormaster completed remote builds in B109717: Diff 352726.Jun 17 2021, 5:04 PM

jansvoboda11 abandoned this revision.Jul 15 2021, 2:08 PM

Revision Contents

Path

Size

clang/

lib/

Lex/

DependencyDirectivesSourceMinimizer.cpp

37 lines

unittests/

Lex/

DependencyDirectivesSourceMinimizerTest.cpp

14 lines

Diff 352726

clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp

Show First 20 Lines • Show All 87 Lines • ▼ Show 20 Lines	private:

void printToNewline(const char &First, const char const End);		void printToNewline(const char &First, const char const End);
void printAdjacentModuleNameParts(const char &First, const char const End);		void printAdjacentModuleNameParts(const char &First, const char const End);
LLVM_NODISCARD bool printAtImportBody(const char *&First,		LLVM_NODISCARD bool printAtImportBody(const char *&First,
const char *const End);		const char *const End);
void printDirectiveBody(const char &First, const char const End);		void printDirectiveBody(const char &First, const char const End);
void printAdjacentMacroArgs(const char &First, const char const End);		void printAdjacentMacroArgs(const char &First, const char const End);
LLVM_NODISCARD bool printMacroArgs(const char &First, const char const End);		LLVM_NODISCARD bool printMacroArgs(const char &First, const char const End);
		void maybePrintNewLine(const char &Last, const char const End);

/// Reports a diagnostic if the diagnostic engine is provided. Always returns		/// Reports a diagnostic if the diagnostic engine is provided. Always returns
/// true at the end.		/// true at the end.
bool reportError(const char *CurPtr, unsigned Err);		bool reportError(const char *CurPtr, unsigned Err);

StringMap<char> SplitIds;		StringMap<char> SplitIds;
StringRef Input;		StringRef Input;
DiagnosticsEngine *Diags;		DiagnosticsEngine *Diags;
▲ Show 20 Lines • Show All 337 Lines • ▼ Show 20 Lines	append(First, findFirstTrailingSpace(
First, LastBeforeTrailingSpace - 1));		First, LastBeforeTrailingSpace - 1));

First = Last;		First = Last;
skipNewline(First, End);		skipNewline(First, End);
skipOverSpaces(First, End);		skipOverSpaces(First, End);
}		}
}		}

static void skipWhitespace(const char &First, const char const End) {		static size_t skipWhitespace(const char &First, const char const End) {
		const char *FirstBefore = First;

for (;;) {		for (;;) {
assert(First <= End);		assert(First <= End);
skipOverSpaces(First, End);		skipOverSpaces(First, End);

if (End - First < 2)		if (End - First < 2)
return;		break;

if (First[0] == '\\' && isVerticalWhitespace(First[1])) {		if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
skipNewline(++First, End);		skipNewline(++First, End);
continue;		continue;
}		}

// Check for a non-comment character.		// Check for a non-comment character.
if (First[0] != '/')		if (First[0] != '/')
return;		break;

// "// ...".		// "// ...".
if (First[1] == '/') {		if (First[1] == '/') {
skipLineComment(First, End);		skipLineComment(First, End);
return;		break;
}		}

// Cannot be a comment.		// Cannot be a comment.
if (First[1] != '*')		if (First[1] != '*')
return;		break;

// "/.../".		// "/.../".
skipBlockComment(First, End);		skipBlockComment(First, End);
}		}

		return First - FirstBefore;
}		}

void Minimizer::printAdjacentModuleNameParts(const char *&First,		void Minimizer::printAdjacentModuleNameParts(const char *&First,
const char *const End) {		const char *const End) {
// Skip over parts of the body.		// Skip over parts of the body.
const char *Last = First;		const char *Last = First;
do		do
++Last;		++Last;
Show All 27 Lines	bool Minimizer::printAtImportBody(const char &First, const char const End) {
}		}
}		}

void Minimizer::printDirectiveBody(const char &First, const char const End) {		void Minimizer::printDirectiveBody(const char &First, const char const End) {
skipWhitespace(First, End); // Skip initial whitespace.		skipWhitespace(First, End); // Skip initial whitespace.
printToNewline(First, End);		printToNewline(First, End);
while (Out.back() == ' ')		while (Out.back() == ' ')
Out.pop_back();		Out.pop_back();
put('\n');		maybePrintNewLine(First, End);
}		}

LLVM_NODISCARD static const char lexRawIdentifier(const char First,		LLVM_NODISCARD static const char lexRawIdentifier(const char First,
const char *const End) {		const char *const End) {
assert(isIdentifierBody(*First) && "invalid identifer");		assert(isIdentifierBody(*First) && "invalid identifer");
const char *Last = First + 1;		const char *Last = First + 1;
while (Last != End && isIdentifierBody(*Last))		while (Last != End && isIdentifierBody(*Last))
++Last;		++Last;
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	for (;;) {
// This is intentionally fairly liberal.		// This is intentionally fairly liberal.
if (!(isIdentifierBody(First) \|\| First == '.' \|\| *First == ','))		if (!(isIdentifierBody(First) \|\| First == '.' \|\| *First == ','))
return true;		return true;

printAdjacentMacroArgs(First, End);		printAdjacentMacroArgs(First, End);
}		}
}		}

		void Minimizer::maybePrintNewLine(const char &Last, const char const End) {
		// Only print newline if doing so won't make the output larger than the input.
		if (Last != End)
		put('\n');
		}

/// Looks for an identifier starting from Last.		/// Looks for an identifier starting from Last.
///		///
/// Updates "First" to just past the next identifier, if any. Returns true iff		/// Updates "First" to just past the next identifier, if any. Returns true iff
/// the identifier matches "Id".		/// the identifier matches "Id".
bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,		bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
const char *const End) {		const char *const End) {
skipWhitespace(First, End);		skipWhitespace(First, End);
if (First == End \|\| !isIdentifierHead(*First))		if (First == End \|\| !isIdentifierHead(*First))
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	bool Minimizer::lexDefine(const char &First, const char const End) {
if (Last == End)		if (Last == End)
return false;		return false;
if (*Last == '(') {		if (*Last == '(') {
size_t Size = Out.size();		size_t Size = Out.size();
if (printMacroArgs(Last, End)) {		if (printMacroArgs(Last, End)) {
// Be robust to bad macro arguments, since they can show up in disabled		// Be robust to bad macro arguments, since they can show up in disabled
// code.		// code.
Out.resize(Size);		Out.resize(Size);
append("(/* invalid */\n");		maybePrintNewLine(Last, End);
skipLine(Last, End);		skipLine(Last, End);
return false;		return false;
}		}
}		}
skipWhitespace(Last, End);		size_t WhitespaceLength = skipWhitespace(Last, End);
if (Last == End)		if (Last == End)
return false;		return false;
if (!isVerticalWhitespace(*Last))		// Only print space if we actually skipped some whitespace. This prevents
		// making the output larger than the input.
		if (WhitespaceLength > 0 && !isVerticalWhitespace(*Last))
put(' ');		put(' ');
printDirectiveBody(Last, End);		printDirectiveBody(Last, End);
First = Last;		First = Last;
return false;		return false;
}		}

bool Minimizer::lexPragma(const char &First, const char const End) {		bool Minimizer::lexPragma(const char &First, const char const End) {
// #pragma.		// #pragma.
▲ Show 20 Lines • Show All 158 Lines • ▼ Show 20 Lines	while (First != End)
if (lexPPLine(First, End))		if (lexPPLine(First, End))
return true;		return true;
return false;		return false;
}		}

bool Minimizer::minimize() {		bool Minimizer::minimize() {
bool Error = minimizeImpl(Input.begin(), Input.end());		bool Error = minimizeImpl(Input.begin(), Input.end());

		assert(Out.size() <= Input.size() && "Output is not larger than input");

if (!Error) {		if (!Error) {
// Add a trailing newline and an EOF on success.		// Add a trailing newline if it won't make the output larger than the input.
if (!Out.empty() && Out.back() != '\n')		if (!Out.empty() && Out.back() != '\n' && Out.size() < Input.size())
Out.push_back('\n');		Out.push_back('\n');
makeToken(pp_eof);		makeToken(pp_eof);
}		}

// Null-terminate the output. This way the memory buffer that's passed to		// Null-terminate the output. This way the memory buffer that's passed to
// Clang will not have to worry about the terminating '\0'.		// Clang will not have to worry about the terminating '\0'.
Out.push_back(0);		Out.push_back(0);
Out.pop_back();		Out.pop_back();
▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines

clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp

Show First 20 Lines • Show All 88 Lines • ▼ Show 20 Lines
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, Define) {		TEST(MinimizeSourceToDependencyDirectivesTest, Define) {
SmallVector<char, 128> Out;		SmallVector<char, 128> Out;
SmallVector<Token, 4> Tokens;		SmallVector<Token, 4> Tokens;

ASSERT_FALSE(		ASSERT_FALSE(
minimizeSourceToDependencyDirectives("#define MACRO", Out, Tokens));		minimizeSourceToDependencyDirectives("#define MACRO", Out, Tokens));
EXPECT_STREQ("#define MACRO\n", Out.data());		EXPECT_STREQ("#define MACRO", Out.data());
ASSERT_EQ(2u, Tokens.size());		ASSERT_EQ(2u, Tokens.size());
ASSERT_EQ(pp_define, Tokens.front().K);		ASSERT_EQ(pp_define, Tokens.front().K);
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, DefineSpacing) {		TEST(MinimizeSourceToDependencyDirectivesTest, DefineSpacing) {
SmallVector<char, 128> Out;		SmallVector<char, 128> Out;

ASSERT_FALSE(		ASSERT_FALSE(
Show All 12 Lines	ASSERT_FALSE(
minimizeSourceToDependencyDirectives("#define MACRO\n\n\n", Out));		minimizeSourceToDependencyDirectives("#define MACRO\n\n\n", Out));
EXPECT_STREQ("#define MACRO\n", Out.data());		EXPECT_STREQ("#define MACRO\n", Out.data());
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, DefineMacroArguments) {		TEST(MinimizeSourceToDependencyDirectivesTest, DefineMacroArguments) {
SmallVector<char, 128> Out;		SmallVector<char, 128> Out;

ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO()", Out));		ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO()", Out));
EXPECT_STREQ("#define MACRO()\n", Out.data());		EXPECT_STREQ("#define MACRO()", Out.data());

ASSERT_FALSE(		ASSERT_FALSE(
minimizeSourceToDependencyDirectives("#define MACRO(a, b...)", Out));		minimizeSourceToDependencyDirectives("#define MACRO(a, b...)", Out));
EXPECT_STREQ("#define MACRO(a,b...)\n", Out.data());		EXPECT_STREQ("#define MACRO(a,b...)\n", Out.data());

ASSERT_FALSE(		ASSERT_FALSE(
minimizeSourceToDependencyDirectives("#define MACRO content", Out));		minimizeSourceToDependencyDirectives("#define MACRO content", Out));
EXPECT_STREQ("#define MACRO content\n", Out.data());		EXPECT_STREQ("#define MACRO content", Out.data());

ASSERT_FALSE(minimizeSourceToDependencyDirectives(		ASSERT_FALSE(minimizeSourceToDependencyDirectives(
"#define MACRO con tent ", Out));		"#define MACRO con tent ", Out));
EXPECT_STREQ("#define MACRO con tent\n", Out.data());		EXPECT_STREQ("#define MACRO con tent\n", Out.data());

ASSERT_FALSE(minimizeSourceToDependencyDirectives(		ASSERT_FALSE(minimizeSourceToDependencyDirectives(
"#define MACRO() con tent ", Out));		"#define MACRO() con tent ", Out));
EXPECT_STREQ("#define MACRO() con tent\n", Out.data());		EXPECT_STREQ("#define MACRO() con tent\n", Out.data());
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) {		TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) {
SmallVector<char, 128> Out;		SmallVector<char, 128> Out;

ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out));		ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out));
EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());		EXPECT_STREQ("#define MACRO\n", Out.data());
		dexonsmithUnsubmitted Not Done Reply Inline Actions I'm not comfortable with this change... the idea of the `(/* invalid */` was to guarantee that an invalid macro argument list didn't get minimized to something valid. Perhaps the result could be `"#define MACRO(\n"`, and just strip the comment? dexonsmith: I'm not comfortable with this change... the idea of the `(/* invalid */` was to guarantee that an…

ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out));		ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out));
EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());		EXPECT_STREQ("#define MACRO\n", Out.data());

ASSERT_FALSE(		ASSERT_FALSE(
minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out));		minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out));
EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());		EXPECT_STREQ("#define MACRO\n", Out.data());
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) {		TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) {
SmallVector<char, 128> Out;		SmallVector<char, 128> Out;

ASSERT_FALSE(minimizeSourceToDependencyDirectives(		ASSERT_FALSE(minimizeSourceToDependencyDirectives(
"#define MACRO(\t)\tcon \t tent\t", Out));		"#define MACRO(\t)\tcon \t tent\t", Out));
EXPECT_STREQ("#define MACRO() con \t tent\n", Out.data());		EXPECT_STREQ("#define MACRO() con \t tent\n", Out.data());
▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoName) {

ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define &\n", Out));		ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define &\n", Out));
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) {		TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) {
SmallVector<char, 128> Out;		SmallVector<char, 128> Out;

ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out));		ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out));
EXPECT_STREQ("#define AND &\n", Out.data());		EXPECT_STREQ("#define AND&\n", Out.data());

ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n"		ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n"
"&\n",		"&\n",
Out));		Out));
EXPECT_STREQ("#define AND &\n", Out.data());		EXPECT_STREQ("#define AND &\n", Out.data());
}		}

TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) {		TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) {
▲ Show 20 Lines • Show All 550 Lines • Show Last 20 Lines