This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lld/
-
ELF/
-
LinkerScript.cpp
-
ScriptLexer.h
1
ScriptLexer.cpp
-
test/ELF/linkerscript/
-
ELF/
-
linkerscript/
1
operators.s

Differential D29963

Apply different tokenization rules to linker script expressions.
ClosedPublic

Authored by ruiu on Feb 14 2017, 2:18 PM.

Download Raw Diff

Details

Reviewers

silvas
grimar

Commits

rG731a66ae9851: Apply different tokenization rules to linker script expressions.
rLLD295225: Apply different tokenization rules to linker script expressions.
rL295225: Apply different tokenization rules to linker script expressions.

Summary

The linker script lexer is context-sensitive. In the regular context,
arithmetic operator characters are regular characters, but in the
expression context, they are independent tokens. This affects how the
lexer tokenizes "3*4", for example. (This kind of expression is real;
the Linux kernel uses it.)

This patch defines function maybeSplitExpr. This function splits the
current token into multiple expression tokens if the lexer is in the
expression context.

Diff Detail

Build Status

Buildable 3980
Build 3980: arc lint + arc unit

Event Timeline

ruiu created this revision.Feb 14 2017, 2:18 PM

:) I think that approach implements exactly what I had in mind when I wrote
in "[llvm-dev] Linking Linux kernel with LLD" thread:

"I was thinking about entering some special parser state for
extracting sub tokens from tokens transparently when
we are inside code that evaluates the expression."

Looks good to me.

lld/test/ELF/linkerscript/operators.s
13	I would add another sub-case that tests all the operators you support in the patch.

grimar added inline comments.Feb 15 2017, 7:54 AM

lld/ELF/ScriptLexer.cpp
187	Maybe just: E = std::max(E, 1); Ret.push_back(S.substr(0, E)); S = S.substr(E); }

Closed by commit rL295225: Apply different tokenization rules to linker script expressions. (authored by ruiu). · Explain WhyFeb 15 2017, 12:09 PM

This revision was automatically updated to reflect the committed changes.

grimar mentioned this in D29576: [ELF] - Change tokenizer to read tokens "on fly"..Feb 16 2017, 12:32 AM

tpimh added a subscriber: tpimh.Feb 16 2017, 2:00 AM

Revision Contents

Path

Size

lld/

ELF/

LinkerScript.cpp

14 lines

ScriptLexer.h

2 lines

ScriptLexer.cpp

69 lines

test/

ELF/

linkerscript/

operators.s

2 lines

Diff 88436

lld/ELF/LinkerScript.cpp

Show First 20 Lines • Show All 1,604 Lines • ▼ Show 20 Lines	static bool isAbsolute(StringRef S) {
return ScriptBase->isAbsolute(S);		return ScriptBase->isAbsolute(S);
}		}

SymbolAssignment *ScriptParser::readAssignment(StringRef Name) {		SymbolAssignment *ScriptParser::readAssignment(StringRef Name) {
StringRef Op = next();		StringRef Op = next();
Expr E;		Expr E;
assert(Op == "=" || Op == "+=");		assert(Op == "=" || Op == "+=");
if (consume("ABSOLUTE")) {		if (consume("ABSOLUTE")) {
// The RHS may be something like "ABSOLUTE(.) & 0xff".		E = readExpr();
// Call readExpr1 to read the whole expression.
E = readExpr1(readParenExpr(), 0);
E.IsAbsolute = [] { return true; };		E.IsAbsolute = [] { return true; };
} else {		} else {
E = readExpr();		E = readExpr();
}		}
if (Op == "+=") {		if (Op == "+=") {
std::string Loc = getCurrentLocation();		std::string Loc = getCurrentLocation();
E = [=](uint64_t Dot) {		E = [=](uint64_t Dot) {
return getSymbolValue(Loc, Name, Dot) + E(Dot);		return getSymbolValue(Loc, Name, Dot) + E(Dot);
};		};
}		}
return new SymbolAssignment(Name, E);		return new SymbolAssignment(Name, E);
}		}

// This is an operator-precedence parser to parse a linker		// This is an operator-precedence parser to parse a linker
// script expression.		// script expression.
Expr ScriptParser::readExpr() { return readExpr1(readPrimary(), 0); }		Expr ScriptParser::readExpr() {
		// Our lexer is context-aware. Set the in-expression bit so that
		// they apply different tokenization rules.
		bool Orig = InExpr;
		InExpr = true;
		Expr E = readExpr1(readPrimary(), 0);
		InExpr = Orig;
		return E;
		}

static Expr combine(StringRef Op, Expr L, Expr R) {		static Expr combine(StringRef Op, Expr L, Expr R) {
auto IsAbs = [=] { return L.IsAbsolute() && R.IsAbsolute(); };		auto IsAbs = [=] { return L.IsAbsolute() && R.IsAbsolute(); };
auto GetOutSec = [=] {		auto GetOutSec = [=] {
const OutputSectionBase *S = L.Section();		const OutputSectionBase *S = L.Section();
return S ? S : R.Section();		return S ? S : R.Section();
};		};

▲ Show 20 Lines • Show All 484 Lines • Show Last 20 Lines

lld/ELF/ScriptLexer.h

Show All 30 Lines	public:
StringRef peek(unsigned N = 0);		StringRef peek(unsigned N = 0);
void skip();		void skip();
bool consume(StringRef Tok);		bool consume(StringRef Tok);
void expect(StringRef Expect);		void expect(StringRef Expect);
std::string getCurrentLocation();		std::string getCurrentLocation();

std::vector<MemoryBufferRef> MBs;		std::vector<MemoryBufferRef> MBs;
std::vector<StringRef> Tokens;		std::vector<StringRef> Tokens;
		bool InExpr = false;
size_t Pos = 0;		size_t Pos = 0;
bool Error = false;		bool Error = false;

private:		private:
		void maybeSplitExpr();
StringRef getLine();		StringRef getLine();
size_t getLineNumber();		size_t getLineNumber();
size_t getColumnNumber();		size_t getColumnNumber();

MemoryBufferRef getCurrentMB();		MemoryBufferRef getCurrentMB();
};		};

} // namespace elf		} // namespace elf
} // namespace lld		} // namespace lld

#endif		#endif

lld/ELF/ScriptLexer.cpp

Show All 20 Lines
// in various corner cases. We do not care much about efficiency because		// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.		// the time spent in parsing linker scripts is usually negligible.
//		//
// Our grammar of the linker script is LL(2), meaning that it needs at		// Our grammar of the linker script is LL(2), meaning that it needs at
// most two-token lookahead to parse. The only place we need two-token		// most two-token lookahead to parse. The only place we need two-token
// lookahead is labels in version scripts, where we need to parse "local :"		// lookahead is labels in version scripts, where we need to parse "local :"
// as if "local:".		// as if "local:".
//		//
// Overall, this lexer works fine for most linker scripts. There's room		// Overall, this lexer works fine for most linker scripts. There might
// for improving compatibility, but that's probably not at the top of our		// be room for improving compatibility, but that's probably not at the
// todo list.		// top of our todo list.
//
// A caveat: This lexer splits an input string into tokens ahead of time,
// so the lexer is not context aware. There's one known corner case. Let's
// say the next string is "val*3" (without quotes). In the context where
// the parser is expecting an expression, that should be tokenizes to
// "val", "*" and "3". In other context, it should be just a single
// token. (If it is in a filename context, it'll be interpeted as a glob
// pattern, for example.) We want to fix this, but it probably needs a
// redesign of this lexer.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "ScriptLexer.h"		#include "ScriptLexer.h"
#include "Error.h"		#include "Error.h"
#include "llvm/ADT/Twine.h"		#include "llvm/ADT/Twine.h"

using namespace llvm;		using namespace llvm;
▲ Show 20 Lines • Show All 121 Lines • ▼ Show 20 Lines	for (;;) {
if (S.size() == Size)		if (S.size() == Size)
return S;		return S;
}		}
}		}

// An erroneous token is handled as if it were the last token before EOF.		// An erroneous token is handled as if it were the last token before EOF.
bool ScriptLexer::atEOF() { return Error || Tokens.size() == Pos; }		bool ScriptLexer::atEOF() { return Error || Tokens.size() == Pos; }

		// Split a given string as an expression.
		// This function returns "3", "*" and "5" for "3*5" for example.
		static std::vector<StringRef> tokenizeExpr(StringRef S) {
		StringRef Ops = "+-*/"; // List of operators

		// Quoted strings are literal strings, so we don't want to split it.
		if (S.startswith("\""))
		return {S};

		// Split S with +-*/ as separators.
		std::vector<StringRef> Ret;
		while (!S.empty()) {
		size_t E = S.find_first_of(Ops);
		if (E == StringRef::npos) {
		Ret.push_back(S);
		break;
		}

		if (E != 0) {
		grimarUnsubmitted Not Done Reply Inline Actions May be just: E = std::max(E, 1); Ret.push_back(S.substr(0, E)); S = S.substr(E); } grimar: May be just: ``` E = std::max(E, 1); Ret.push_back(S.substr(0, E)); S = S.substr(E); }…
		Ret.push_back(S.substr(0, E));
		S = S.substr(E);
		continue;
		}

		Ret.push_back(S.substr(0, 1));
		S = S.substr(1);
		}
		return Ret;
		}

		// In contexts where expressions are expected, the lexer should apply
		// different tokenization rules than the default one. By default,
		// arithmetic operator characters are regular characters, but in the
		// expression context, they should be independent tokens.
		//
		// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
		// in the expression context.
		//
		// This function may split the current token into multiple tokens.
		void ScriptLexer::maybeSplitExpr() {
		if (!InExpr || Error || atEOF())
		return;

		std::vector<StringRef> V = tokenizeExpr(Tokens[Pos]);
		if (V.size() == 1)
		return;
		Tokens.erase(Tokens.begin() + Pos);
		Tokens.insert(Tokens.begin() + Pos, V.begin(), V.end());
		}

StringRef ScriptLexer::next() {		StringRef ScriptLexer::next() {
		maybeSplitExpr();

if (Error)		if (Error)
return "";		return "";
if (atEOF()) {		if (atEOF()) {
setError("unexpected EOF");		setError("unexpected EOF");
return "";		return "";
}		}
return Tokens[Pos++];		return Tokens[Pos++];
}		}

StringRef ScriptLexer::peek(unsigned N) {		StringRef ScriptLexer::peek(unsigned N) {
		maybeSplitExpr();

StringRef Tok;		StringRef Tok;
for (unsigned I = 0; I <= N; ++I) {		for (unsigned I = 0; I <= N; ++I) {
Tok = next();		Tok = next();
if (Error)		if (Error)
return "";		return "";
}		}
Pos = Pos - N - 1;		Pos = Pos - N - 1;
return Tok;		return Tok;
Show All 36 Lines

lld/test/ELF/linkerscript/operators.s

	# REQUIRES: x86			# REQUIRES: x86
	# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t			# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
	# RUN: echo "SECTIONS { \			# RUN: echo "SECTIONS { \
	# RUN: . = 0xFFF0; \			# RUN: . = 0xFFF0; \
	# RUN: . = . + 0x10; \			# RUN: . = . + 0x10; \
	# RUN: .plus : { *(.plus) } \			# RUN: .plus : { *(.plus) } \
	# RUN: . = 0x11010 - 0x10; \			# RUN: . = 0x11010 - 0x10; \
	# RUN: .minus : { *(.minus) } \			# RUN: .minus : { *(.minus) } \
	# RUN: . = 0x24000 / 0x2; \			# RUN: . = 0x24000 / 0x2; \
	# RUN: .div : { *(.div) } \			# RUN: .div : { *(.div) } \
	# RUN: . = 0x11000 + 0x1000 * 0x2; \			# RUN: . = 0x11000 + 0x1000 * 0x2; \
	# RUN: .mul : { *(.mul) } \			# RUN: .mul : { *(.mul) } \
	# RUN: . = 0x10000 + (0x1000 + 0x1000) * 0x2; \			# RUN: . = 0x10000+(0x1000+0x1000)*0x2; \
				grimarUnsubmitted Not Done Reply Inline Actions I would add another sub-case, which tests all operators you support in patch. grimar: I would add another sub-case, which tests all operators you support in patch.
	# RUN: .bracket : { *(.bracket) } \			# RUN: .bracket : { *(.bracket) } \
	# RUN: . = 0x17000 & 0x15000; \			# RUN: . = 0x17000 & 0x15000; \
	# RUN: .and : { *(.and) } \			# RUN: .and : { *(.and) } \
	# RUN: . = 0x1 ? 0x16000 : 0x999999; \			# RUN: . = 0x1 ? 0x16000 : 0x999999; \
	# RUN: .ternary1 : { *(.ternary1) } \			# RUN: .ternary1 : { *(.ternary1) } \
	# RUN: . = 0x0 ? 0x999999 : 0x17000; \			# RUN: . = 0x0 ? 0x999999 : 0x17000; \
	# RUN: .ternary2 : { *(.ternary2) } \			# RUN: .ternary2 : { *(.ternary2) } \
	# RUN: . = 0x0 < 0x1 ? 0x18000 : 0x999999; \			# RUN: . = 0x0 < 0x1 ? 0x18000 : 0x999999; \
	▲ Show 20 Lines • Show All 168 Lines • Show Last 20 Lines