From 939b82d11279ee2700c19c41a770051cc28e045f Mon Sep 17 00:00:00 2001 From: Joel Severin Date: Tue, 26 Mar 2024 12:37:11 +0100 Subject: [PATCH] Hack patch to allow GNU ld style linker scripts in wasm-ld --- lld/wasm/CMakeLists.txt | 2 + lld/wasm/Config.h | 1 + lld/wasm/Driver.cpp | 30 +- lld/wasm/InputFiles.cpp | 4 + lld/wasm/Options.td | 2 + lld/wasm/ScriptLexer.cpp | 328 ++++++++++++ lld/wasm/ScriptLexer.h | 56 ++ lld/wasm/ScriptParser.cpp | 1056 +++++++++++++++++++++++++++++++++++++ lld/wasm/ScriptParser.h | 341 ++++++++++++ lld/wasm/Writer.cpp | 241 ++++++++- 10 files changed, 2051 insertions(+), 10 deletions(-) create mode 100644 lld/wasm/ScriptLexer.cpp create mode 100644 lld/wasm/ScriptLexer.h create mode 100644 lld/wasm/ScriptParser.cpp create mode 100644 lld/wasm/ScriptParser.h diff --git a/lld/wasm/CMakeLists.txt b/lld/wasm/CMakeLists.txt index 6033bfbf9..53048d818 100644 --- a/lld/wasm/CMakeLists.txt +++ b/lld/wasm/CMakeLists.txt @@ -12,6 +12,8 @@ add_lld_library(lldWasm OutputSections.cpp OutputSegment.cpp Relocations.cpp + ScriptLexer.cpp + ScriptParser.cpp SymbolTable.cpp Symbols.cpp SyntheticSections.cpp diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 97c508bda..e42ffdb94 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -57,6 +57,7 @@ struct Configuration { bool growableTable; bool gcSections; llvm::StringSet<> keepSections; + std::optional linkerScript; std::optional> memoryImport; std::optional memoryExport; bool sharedMemory; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 635f19f78..af849f9b7 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -327,6 +327,15 @@ static std::optional findFromSearchPaths(StringRef path) { return std::nullopt; } +// If a linker/version script doesn't exist in the current directory, we also +// look for the script in the '-L' search paths. This matches the behaviour of +// '-T', --version-script=, and linker script INPUT() command in ld.bfd. 
+static std::optional searchScript(StringRef name) { + if (fs::exists(name)) + return name.str(); + return findFromSearchPaths(name); +} + // This is for -l. We'll look for lib.a from // search paths. static std::optional searchLibraryBaseName(StringRef name) { @@ -388,6 +397,13 @@ void LinkerDriver::createFiles(opt::InputArgList &args) { error("stray --end-lib"); inLib = false; break; + case OPT_script: + if (std::optional path = searchScript(arg->getValue())) { + config->linkerScript = readFile(*path); + } else { + error(Twine("cannot find linker script ") + arg->getValue()); + } + break; } } if (files.empty() && errorCount() == 0) @@ -617,12 +633,6 @@ static void setConfigs() { // pointer. if (!config->tableBase) config->tableBase = 1; - // The default offset for static/global data, for when --global-base is - // not specified on the command line. The precise value of 1024 is - // somewhat arbitrary, and pre-dates wasm-ld (Its the value that - // emscripten used prior to wasm-ld). - if (!config->globalBase && !config->relocatable && !config->stackFirst) - config->globalBase = 1024; } if (config->relocatable) { @@ -1195,6 +1205,14 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (errorCount()) return; + // The default offset for static/global data, for when --global-base is + // not specified on the command line. The precise value of 1024 is + // somewhat arbitrary, and pre-dates wasm-ld (Its the value that + // emscripten used prior to wasm-ld). + if (!config->globalBase && !config->linkerScript && !ctx.isPic && + !config->relocatable && !config->stackFirst) + config->globalBase = 1024; + checkOptions(args); if (errorCount()) return; diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index f5e946aca..db768fe63 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -362,6 +362,10 @@ static bool shouldMerge(const WasmSection &sec) { } static bool shouldMerge(const WasmSegment &seg) { + // No merge chunks when using linker scripts. 
+ if (config->linkerScript) + return false; + // As of now we only support merging strings, and only with single byte // alignment (2^0). if (!(seg.Data.LinkingFlags & WASM_SEG_FLAG_STRINGS) || diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 8190717ce..bc1cd6b74 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -294,6 +294,8 @@ def thinlto_jobs: JJ<"thinlto-jobs=">, def lto_debug_pass_manager: FF<"lto-debug-pass-manager">, HelpText<"Debug new pass manager">; +defm script: Eq<"script", "Use linker script ((very) limited support for GNU ld/ELF linker scripts)">; + // Experimental PIC mode. def experimental_pic: FF<"experimental-pic">, HelpText<"Enable Experimental PIC">; diff --git a/lld/wasm/ScriptLexer.cpp b/lld/wasm/ScriptLexer.cpp new file mode 100644 index 000000000..49fb05a2e --- /dev/null +++ b/lld/wasm/ScriptLexer.cpp @@ -0,0 +1,328 @@ +//===- ScriptLexer.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a lexer for the linker script. +// +// The linker script's grammar is not complex but ambiguous due to the +// lack of the formal specification of the language. What we are trying to +// do in this and other files in LLD is to make a "reasonable" linker +// script processor. +// +// Among simplicity, compatibility and efficiency, we put the most +// emphasis on simplicity when we wrote this lexer. Compatibility with the +// GNU linkers is important, but we did not try to clone every tiny corner +// case of their lexers, as even ld.bfd and ld.gold are subtly different +// in various corner cases. We do not care much about efficiency because +// the time spent in parsing linker scripts is usually negligible. 
+//
+// Our grammar of the linker script is LL(2), meaning that it needs at
+// most two-token lookahead to parse. The only place we need two-token
+// lookahead is labels in version scripts, where we need to parse "local :"
+// as if "local:".
+//
+// Overall, this lexer works fine for most linker scripts. There might
+// be room for improving compatibility, but that's probably not at the
+// top of our todo list.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScriptLexer.h"
+#include "lld/Common/ErrorHandler.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm> // NOTE(review): header name was lost in extraction; <algorithm> matches upstream lld/ELF - confirm.
+
+using namespace llvm;
+using namespace lld;
+using namespace lld::wasm;
+
+// Returns a whole line containing the current token. Reads tokens[pos - 1].
+StringRef ScriptLexer::getLine() {
+  StringRef s = getCurrentMB().getBuffer();
+  StringRef tok = tokens[pos - 1];
+
+  size_t lineStart = s.rfind('\n', tok.data() - s.data()); // Last newline at or before the token.
+  if (lineStart != StringRef::npos)
+    s = s.substr(lineStart + 1);
+  return s.substr(0, s.find_first_of("\r\n"));
+}
+
+// Returns 1-based line number of the current token.
+size_t ScriptLexer::getLineNumber() {
+  if (pos == 0)
+    return 1;
+  StringRef s = getCurrentMB().getBuffer();
+  StringRef tok = tokens[pos - 1];
+  const size_t tokOffset = tok.data() - s.data();
+
+  // For the first token, or when going backwards, start from the beginning of
+  // the buffer. If this token is after the previous token, start from the
+  // previous token.
+  size_t line = 1;
+  size_t start = 0;
+  if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {
+    start = lastLineNumberOffset;
+    line = lastLineNumber;
+  }
+
+  line += s.substr(start, tokOffset - start).count('\n');
+
+  // Store the line number of this token for reuse.
+  lastLineNumberOffset = tokOffset;
+  lastLineNumber = line;
+
+  return line;
+}
+
+// Returns 0-based column number of the current token.
+size_t ScriptLexer::getColumnNumber() {
+  StringRef tok = tokens[pos - 1];
+  return tok.data() - getLine().data();
+}
+
+std::string ScriptLexer::getCurrentLocation() { // "<buffer identifier>:<line number>"
+  std::string filename = std::string(getCurrentMB().getBufferIdentifier());
+  return (filename + ":" + Twine(getLineNumber())).str();
+}
+
+ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
+
+// We don't want to record cascading errors. Keep only the first one.
+void ScriptLexer::setError(const Twine &msg) {
+  if (errorCount())
+    return;
+
+  std::string s = (getCurrentLocation() + ": " + msg).str();
+  if (pos)
+    s += "\n>>> " + getLine().str() + "\n>>> " +
+         std::string(getColumnNumber(), ' ') + "^";
+  error(s);
+}
+
+// Split S into linker script tokens.
+void ScriptLexer::tokenize(MemoryBufferRef mb) {
+  std::vector<StringRef> vec;
+  mbs.push_back(mb);
+  StringRef s = mb.getBuffer();
+  StringRef begin = s;
+
+  for (;;) {
+    s = skipSpace(s);
+    if (s.empty())
+      break;
+
+    // Quoted token. Note that double-quote characters are parts of a token
+    // because, in a glob match context, only unquoted tokens are interpreted
+    // as glob patterns. Double-quoted tokens are literal patterns in that
+    // context.
+    if (s.starts_with("\"")) {
+      size_t e = s.find("\"", 1);
+      if (e == StringRef::npos) {
+        StringRef filename = mb.getBufferIdentifier();
+        size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
+        error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
+        return;
+      }
+
+      vec.push_back(s.take_front(e + 1));
+      s = s.substr(e + 1);
+      continue;
+    }
+
+    // Some operators form separate tokens.
+    if (s.starts_with("<<=") || s.starts_with(">>=")) {
+      vec.push_back(s.substr(0, 3));
+      s = s.substr(3);
+      continue;
+    }
+    if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&|", s[0])) ||
+                         (s[0] == s[1] && strchr("<>&|", s[0])))) {
+      vec.push_back(s.substr(0, 2));
+      s = s.substr(2);
+      continue;
+    }
+
+    // Unquoted token. This is more relaxed than tokens in C-like language,
+    // so that you can write "file-name.cpp" as one bare token, for example.
+    size_t len = s.find_first_not_of(
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+        "0123456789_.$/\\~=+[]*?-!^:");
+
+    // A character that cannot start a word (which is usually a
+    // punctuation) forms a single character token.
+    if (len == 0)
+      len = 1;
+    vec.push_back(s.substr(0, len));
+    s = s.substr(len);
+  }
+
+  tokens.insert(tokens.begin() + pos, vec.begin(), vec.end()); // Member pos: splice at the read position.
+}
+
+// Skip leading whitespace characters or comments.
+StringRef ScriptLexer::skipSpace(StringRef s) {
+  for (;;) {
+    if (s.starts_with("/*")) {
+      size_t e = s.find("*/", 2);
+      if (e == StringRef::npos) {
+        setError("unclosed comment in a linker script");
+        return "";
+      }
+      s = s.substr(e + 2);
+      continue;
+    }
+    if (s.starts_with("#")) {
+      size_t e = s.find('\n', 1);
+      if (e == StringRef::npos)
+        e = s.size() - 1;
+      s = s.substr(e + 1);
+      continue;
+    }
+    size_t size = s.size();
+    s = s.ltrim();
+    if (s.size() == size)
+      return s;
+  }
+}
+
+// An erroneous token is handled as if it were the last token before EOF.
+bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
+
+// Split a given string as an expression.
+// This function returns "3", "*" and "5" for "3*5" for example.
+static std::vector<StringRef> tokenizeExpr(StringRef s) {
+  StringRef ops = "!~*/+-<>?:="; // List of operators
+
+  // Quoted strings are literal strings, so we don't want to split it.
+  if (s.starts_with("\""))
+    return {s};
+
+  // Split S with operators as separators.
+  std::vector<StringRef> ret;
+  while (!s.empty()) {
+    size_t e = s.find_first_of(ops);
+
+    // No need to split if there is no operator.
+    if (e == StringRef::npos) {
+      ret.push_back(s);
+      break;
+    }
+
+    // Get a token before the operator.
+    if (e != 0)
+      ret.push_back(s.substr(0, e));
+
+    // Get the operator as a token.
+    // Keep the !=, ==, >=, <=, << and >> operators as single tokens.
+    if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
+        s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
+        s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
+      ret.push_back(s.substr(e, 2));
+      s = s.substr(e + 2);
+    } else {
+      ret.push_back(s.substr(e, 1));
+      s = s.substr(e + 1);
+    }
+  }
+  return ret;
+}
+
+// In contexts where expressions are expected, the lexer should apply
+// different tokenization rules than the default one. By default,
+// arithmetic operator characters are regular characters, but in the
+// expression context, they should be independent tokens.
+//
+// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
+// in the expression context.
+//
+// This function may split the current token into multiple tokens.
+void ScriptLexer::maybeSplitExpr() {
+  if (!inExpr || errorCount() || atEOF())
+    return;
+
+  std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
+  if (v.size() == 1)
+    return;
+  tokens.erase(tokens.begin() + pos);
+  tokens.insert(tokens.begin() + pos, v.begin(), v.end());
+}
+
+StringRef ScriptLexer::next() { // Returns the current token and advances pos; "" on error or EOF.
+  maybeSplitExpr();
+
+  if (errorCount())
+    return "";
+  if (atEOF()) {
+    setError("unexpected EOF");
+    return "";
+  }
+  return tokens[pos++];
+}
+
+StringRef ScriptLexer::peek() { // next() followed by stepping pos back by one on success.
+  StringRef tok = next();
+  if (errorCount())
+    return "";
+  pos = pos - 1;
+  return tok;
+}
+
+StringRef ScriptLexer::peek2() { // Token after the next one; pos is stepped back by two on success.
+  skip();
+  StringRef tok = next();
+  if (errorCount())
+    return "";
+  pos = pos - 2;
+  return tok;
+}
+
+bool ScriptLexer::consume(StringRef tok) { // Skips the next token and returns true iff it equals tok.
+  if (peek() == tok) {
+    skip();
+    return true;
+  }
+  return false;
+}
+
+// Consumes Tok followed by ":". Space is allowed between Tok and ":".
+bool ScriptLexer::consumeLabel(StringRef tok) {
+  if (consume((tok + ":").str()))
+    return true;
+  if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
+      tokens[pos + 1] == ":") {
+    pos += 2;
+    return true;
+  }
+  return false;
+}
+
+void ScriptLexer::skip() { (void)next(); }
+
+void ScriptLexer::expect(StringRef expect) { // Reads one token; reports an error if it differs from expect.
+  if (errorCount())
+    return;
+  StringRef tok = next();
+  if (tok != expect)
+    setError(expect + " expected, but got " + tok);
+}
+
+// Returns true if S encloses T.
+static bool encloses(StringRef s, StringRef t) {
+  return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end();
+}
+
+MemoryBufferRef ScriptLexer::getCurrentMB() {
+  // Find input buffer containing the current token.
+  assert(!mbs.empty());
+  if (pos == 0)
+    return mbs.back();
+  for (MemoryBufferRef mb : mbs)
+    if (encloses(mb.getBuffer(), tokens[pos - 1]))
+      return mb;
+  llvm_unreachable("getCurrentMB: failed to find a token");
+}
diff --git a/lld/wasm/ScriptLexer.h b/lld/wasm/ScriptLexer.h
new file mode 100644
index 000000000..33e2bbd05
--- /dev/null
+++ b/lld/wasm/ScriptLexer.h
@@ -0,0 +1,56 @@
+//===- ScriptLexer.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_WASM_SCRIPT_LEXER_H
+#define LLD_WASM_SCRIPT_LEXER_H
+
+#include "lld/Common/LLVM.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include <vector>
+
+namespace lld::wasm {
+
+class ScriptLexer {
+public:
+  explicit ScriptLexer(MemoryBufferRef mb);
+
+  void setError(const Twine &msg);
+  void tokenize(MemoryBufferRef mb);
+  StringRef skipSpace(StringRef s);
+  bool atEOF();
+  StringRef next();
+  StringRef peek();
+  StringRef peek2();
+  void skip();
+  bool consume(StringRef tok);
+  void expect(StringRef expect);
+  bool consumeLabel(StringRef tok);
+  std::string getCurrentLocation();
+
+  std::vector<MemoryBufferRef> mbs; // Every buffer handed to tokenize().
+  std::vector<StringRef> tokens;    // Token stream; entries point into the buffers in mbs.
+  bool inExpr = false;              // When set, the lexer splits tokens at expression operators.
+  size_t pos = 0;                   // Index of the next token to read.
+
+  size_t lastLineNumber = 0;       // Cache used by getLineNumber().
+  size_t lastLineNumberOffset = 0; // Buffer offset that lastLineNumber belongs to.
+
+protected:
+  MemoryBufferRef getCurrentMB();
+
+private:
+  void maybeSplitExpr();
+  StringRef getLine();
+  size_t getLineNumber();
+  size_t getColumnNumber();
+};
+
+} // namespace lld::wasm
+
+#endif
diff --git a/lld/wasm/ScriptParser.cpp b/lld/wasm/ScriptParser.cpp
new file mode 100644
index 000000000..4f246f85a
--- /dev/null
+++ b/lld/wasm/ScriptParser.cpp
@@ -0,0 +1,1056 @@
+//===- ScriptParser.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a recursive-descent parser for linker scripts.
+// Parsed results are stored to Config and Script global objects.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScriptParser.h"
+#include "OutputSections.h"
+#include "OutputSegment.h"
+#include "ScriptLexer.h"
+#include "SymbolTable.h"
+#include "Symbols.h"
+#include "lld/Common/CommonLinkerContext.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/TimeProfiler.h"
+#include <cassert> // NOTE(review): the three standard header names were lost in extraction;
+#include <limits>  // these match upstream lld/ELF/ScriptParser.cpp - confirm.
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::support::endian;
+using namespace lld;
+using namespace lld::wasm;
+
+static StringRef unquote(StringRef s) { // Drops the first and last character of a token starting with '"'.
+  if (s.starts_with("\""))
+    return s.substr(1, s.size() - 2);
+  return s;
+}
+
+// Some operations only support one non absolute value. Move the
+// absolute one to the right hand side for convenience.
+static void moveAbsRight(ExprValue &a, ExprValue &b) {
+  if (a.sec == nullptr || (a.forceAbsolute && !b.isAbsolute()))
+    std::swap(a, b);
+  if (!b.isAbsolute())
+    error(a.loc + ": at least one side of the expression must be absolute");
+}
+
+static ExprValue add(ExprValue a, ExprValue b) {
+  moveAbsRight(a, b);
+  return {a.sec, a.forceAbsolute, a.getSectionOffset() + b.getValue(), a.loc};
+}
+
+static ExprValue sub(ExprValue a, ExprValue b) {
+  // The distance between two symbols in sections is absolute.
+  if (!a.isAbsolute() && !b.isAbsolute())
+    return a.getValue() - b.getValue();
+  return {a.sec, false, a.getSectionOffset() - b.getValue(), a.loc};
+}
+
+static ExprValue bitAnd(ExprValue a, ExprValue b) {
+  moveAbsRight(a, b);
+  return {a.sec, a.forceAbsolute,
+          (a.getValue() & b.getValue()) - a.getSecAddr(), a.loc};
+}
+
+static ExprValue bitOr(ExprValue a, ExprValue b) {
+  moveAbsRight(a, b);
+  return {a.sec, a.forceAbsolute,
+          (a.getValue() | b.getValue()) - a.getSecAddr(), a.loc};
+}
+
+uint64_t ExprValue::getValue() const { // Section-relative values are rebased on sec->address, then aligned.
+  if (sec)
+    return alignToPowerOf2(sec->address + sec->getOffset(val),
+                           alignment);
+  return alignToPowerOf2(val, alignment);
+}
+
+uint64_t ExprValue::getSecAddr() const { // 0 when the value is not attached to a section.
+  return sec ? sec->address + sec->getOffset(0) : 0;
+}
+
+uint64_t ExprValue::getSectionOffset() const {
+  // If the alignment is trivial, we don't have to compute the full
+  // value to know the offset. This allows this function to succeed in
+  // cases where the output section is not yet known.
+  if (alignment == 1 && !sec)
+    return val;
+  return getValue() - getSecAddr();
+}
+
+void ScriptParser::readLinkerScript() { // Top level: accepts ";", SECTIONS and symbol assignments only.
+  while (!atEOF()) {
+    StringRef tok = next();
+    if (tok == ";")
+      continue;
+
+    if (tok == "SECTIONS") {
+      readSections();
+    } else if (SymbolAssignment *cmd = readAssignment(tok)) {
+      sectionCommands.push_back(cmd);
+    } else {
+      setError("unknown directive: " + tok);
+    }
+  }
+}
+
+void ScriptParser::readSections() { // Parses "{ ... }" after SECTIONS and appends the result to sectionCommands.
+  expect("{");
+  SmallVector<SectionCommand *, 0> v; // NOTE(review): template arguments restored from usage/upstream - confirm.
+  while (!errorCount() && !consume("}")) {
+    StringRef tok = next();
+    if (tok == "OVERLAY") {
+      setError("OVERLAY not supported");
+      continue;
+    }
+
+    if (SectionCommand *cmd = readAssignment(tok))
+      v.push_back(cmd);
+    else
+      v.push_back(readOutputSectionDescription(tok));
+  }
+
+  // If DATA_SEGMENT_RELRO_END is absent, for sections after DATA_SEGMENT_ALIGN,
+  // the relro fields should be cleared.
+/*
+  if (!seenRelroEnd)
+    for (SectionCommand *cmd : v)
+      if (auto *osd = dyn_cast<OutputDesc>(cmd))
+        osd->osec.relro = false;
+*/
+  sectionCommands.insert(sectionCommands.end(), v.begin(), v.end());
+
+
+  if (atEOF() || !consume("INSERT")) {
+    hasSectionsCommand = true;
+    return;
+  }
+
+  setError("INSERT BEFORE/AFTER not supported");
+}
+
+static int precedence(StringRef op) { // Larger binds tighter; -1 for tokens that are not listed.
+  return StringSwitch<int>(op)
+      .Cases("*", "/", "%", 10)
+      .Cases("+", "-", 9)
+      .Cases("<<", ">>", 8)
+      .Cases("<", "<=", ">", ">=", 7)
+      .Cases("==", "!=", 6)
+      .Case("&", 5)
+      .Case("|", 4)
+      .Case("&&", 3)
+      .Case("||", 2)
+      .Case("?", 1)
+      .Default(-1);
+}
+
+StringMatcher ScriptParser::readFilePatterns() { // Collects tokens as patterns until ")" is consumed.
+  StringMatcher Matcher;
+
+  while (!errorCount() && !consume(")"))
+    Matcher.addPattern(SingleStringMatcher(next()));
+  return Matcher;
+}
+
+SortSectionPolicy ScriptParser::peekSortKind() { // Maps the next token to a sort policy without consuming it.
+  return StringSwitch<SortSectionPolicy>(peek())
+      .Cases("SORT", "SORT_BY_NAME", SortSectionPolicy::Name)
+      .Case("SORT_BY_ALIGNMENT", SortSectionPolicy::Alignment)
+      .Case("SORT_BY_INIT_PRIORITY", SortSectionPolicy::Priority)
+      .Case("SORT_NONE", SortSectionPolicy::None)
+      .Default(SortSectionPolicy::Default);
+}
+
+SortSectionPolicy ScriptParser::readSortKind() { // Like peekSortKind(), but consumes a recognized keyword.
+  SortSectionPolicy ret = peekSortKind();
+  if (ret != SortSectionPolicy::Default)
+    skip();
+  return ret;
+}
+
+// Reads SECTIONS command contents in the following form:
+//
+// <contents> ::= <elem>*
+// <elem>     ::= <exclude>? <glob-pattern>
+// <exclude>  ::= "EXCLUDE_FILE" "(" <glob-pattern>+ ")"
+//
+// For example,
+//
+// *(.foo EXCLUDE_FILE (a.o) .bar EXCLUDE_FILE (b.o) .baz)
+//
+// is parsed as ".foo", ".bar" with "a.o", and ".baz" with "b.o".
+// The semantics of that is section .foo in any file, section .bar in
+// any file but a.o, and section .baz in any file but b.o.
+SmallVector<SectionPattern, 0> ScriptParser::readInputSectionsList() { // NOTE(review): template arguments restored from usage/upstream; must match ScriptParser.h.
+  SmallVector<SectionPattern, 0> ret;
+  while (!errorCount() && peek() != ")") {
+    StringMatcher excludeFilePat;
+    if (consume("EXCLUDE_FILE")) {
+      expect("(");
+      excludeFilePat = readFilePatterns();
+    }
+
+    StringMatcher SectionMatcher;
+    // Break if the next token is ), EXCLUDE_FILE, or SORT*.
+    while (!errorCount() && peek() != ")" && peek() != "EXCLUDE_FILE" &&
+           peekSortKind() == SortSectionPolicy::Default)
+      SectionMatcher.addPattern(unquote(next()));
+
+    if (!SectionMatcher.empty())
+      ret.push_back({std::move(excludeFilePat), std::move(SectionMatcher)});
+    else if (excludeFilePat.empty())
+      break;
+    else
+      setError("section pattern is expected");
+  }
+  return ret;
+}
+
+// Reads contents of "SECTIONS" directive. That directive contains a
+// list of glob patterns for input sections. The grammar is as follows.
+//
+// <patterns> ::= <section-list>
+//              | <sort> "(" <section-list> ")"
+//              | <sort> "(" <sort> "(" <section-list> ")" ")"
+//
+// <sort>     ::= "SORT" | "SORT_BY_NAME" | "SORT_BY_ALIGNMENT"
+//              | "SORT_BY_INIT_PRIORITY" | "SORT_NONE"
+//
+// <section-list> is parsed by readInputSectionsList().
+InputSectionDescription *
+ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
+                                    uint64_t withoutFlags) {
+  auto *cmd =
+      make<InputSectionDescription>(filePattern, withFlags, withoutFlags);
+  expect("(");
+
+  while (!errorCount() && !consume(")")) {
+    SortSectionPolicy outer = readSortKind();
+    SortSectionPolicy inner = SortSectionPolicy::Default;
+    SmallVector<SectionPattern, 0> v; // NOTE(review): template arguments restored from usage/upstream - confirm.
+    if (outer != SortSectionPolicy::Default) {
+      expect("(");
+      inner = readSortKind();
+      if (inner != SortSectionPolicy::Default) {
+        expect("(");
+        v = readInputSectionsList();
+        expect(")");
+      } else {
+        v = readInputSectionsList();
+      }
+      expect(")");
+    } else {
+      v = readInputSectionsList();
+    }
+
+    for (SectionPattern &pat : v) {
+      pat.sortInner = inner;
+      pat.sortOuter = outer;
+    }
+
+    std::move(v.begin(), v.end(), std::back_inserter(cmd->sectionPatterns));
+  }
+  return cmd;
+}
+
+InputSectionDescription *
+ScriptParser::readInputSectionDescription(StringRef tok) {
+  // Input section wildcard can be surrounded by KEEP.
+  // https://sourceware.org/binutils/docs/ld/Input-Section-Keep.html#Input-Section-Keep
+  uint64_t withFlags = 0;
+  uint64_t withoutFlags = 0;
+  if (tok == "KEEP") {
+    expect("(");
+    if (consume("INPUT_SECTION_FLAGS"))
+      setError("INPUT_SECTION_FLAGS not supported");
+    InputSectionDescription *cmd =
+        readInputSectionRules(next(), withFlags, withoutFlags);
+    expect(")");
+    keptSections.push_back(cmd);
+    return cmd;
+  }
+  if (tok == "INPUT_SECTION_FLAGS") {
+    setError("INPUT_SECTION_FLAGS not supported");
+    tok = next();
+  }
+  return readInputSectionRules(tok, withFlags, withoutFlags);
+}
+
+void ScriptParser::readSort() { // Parses "(CONSTRUCTORS)" following SORT; nothing is recorded.
+  expect("(");
+  expect("CONSTRUCTORS");
+  expect(")");
+}
+
+Expr ScriptParser::readAssert() { // Parses "(expr, message)"; the returned Expr reports message when expr is 0.
+  expect("(");
+  Expr e = readExpr();
+  expect(",");
+  StringRef msg = unquote(next());
+  expect(")");
+
+  return [=] {
+    if (!e().getValue())
+      error(msg);
+    return dot;
+  };
+}
+
+/*
+#define ECase(X)                                                               \
+  { #X, X }
+constexpr std::pair<const char *, unsigned> typeMap[] = {
+    ECase(SHT_PROGBITS),   ECase(SHT_NOTE),       ECase(SHT_NOBITS),
+    ECase(SHT_INIT_ARRAY), ECase(SHT_FINI_ARRAY), ECase(SHT_PREINIT_ARRAY),
+};
+#undef ECase
+*/
+// Tries to read the special directive for an output section definition which
+// can be one of following: "(NOLOAD)", "(COPY)", "(INFO)", "(OVERLAY)", and
+// "(TYPE=<value>)".
+// Tok1 and Tok2 are next 2 tokens peeked. See comment for
+// readSectionAddressType below.
+bool ScriptParser::readSectionDirective(SectionBase *osec, StringRef tok1, StringRef tok2) { + if (tok1 != "(") + return false; + if (tok2 != "NOLOAD" && tok2 != "COPY" && tok2 != "INFO" && + tok2 != "OVERLAY" && tok2 != "TYPE") + return false; + + expect("("); + setError("section directive " + tok2 + " currently not supported"); + if (consume("TYPE")) + { expect("="); readExpr(); } + else + skip(); + + // cmd = osec->outputSection applies below +/* if (consume("NOLOAD")) { + cmd->type = SHT_NOBITS; + cmd->typeIsSet = true; + } else if (consume("TYPE")) { + expect("="); + StringRef value = peek(); + auto it = llvm::find_if(typeMap, [=](auto e) { return e.first == value; }); + if (it != std::end(typeMap)) { + // The value is a recognized literal SHT_*. + cmd->type = it->second; + skip(); + } else if (value.starts_with("SHT_")) { + setError("unknown section type " + value); + } else { + // Otherwise, read an expression. + cmd->type = readExpr()().getValue(); + } + cmd->typeIsSet = true; + } else { + skip(); // This is "COPY", "INFO" or "OVERLAY". + cmd->nonAlloc = true; + } +*/ + expect(")"); + return true; +} + +// Reads an expression and/or the special directive for an output +// section definition. Directive is one of following: "(NOLOAD)", +// "(COPY)", "(INFO)" or "(OVERLAY)". +// +// An output section name can be followed by an address expression +// and/or directive. This grammar is not LL(1) because "(" can be +// interpreted as either the beginning of some expression or beginning +// of directive. +// +// https://sourceware.org/binutils/docs/ld/Output-Section-Address.html +// https://sourceware.org/binutils/docs/ld/Output-Section-Type.html +void ScriptParser::readSectionAddressType(SectionBase *osec) { + // Temporarily set inExpr to support TYPE= without spaces. 
+ bool saved = std::exchange(inExpr, true); + bool isDirective = readSectionDirective(osec, peek(), peek2()); + inExpr = saved; + if (isDirective) + return; + + osec->address = readExpr()().getValue(); + setError("setting address for " + osec->name + " to " + Twine(osec->address)); + if (peek() == "(" && !readSectionDirective(osec, "(", peek2())) + setError("unknown section directive: " + peek2()); +} + +static Expr checkAlignment(Expr e, std::string &loc) { + return [=] { + uint64_t alignment = std::max((uint64_t)1, e().getValue()); + if (!isPowerOf2_64(alignment)) { + error(loc + ": alignment must be power of 2"); + return (uint64_t)1; // Return a dummy value. + } + return alignment; + }; +} + +OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) { + OutputDesc *cmd = createOutputSection(outSec, getCurrentLocation()); + SectionBase *osec = &cmd->osec; + // Maybe relro. Will reset to false if DATA_SEGMENT_RELRO_END is absent. + //osec->relro = seenDataAlign && !seenRelroEnd; + + //size_t symbolsReferenced = referencedSymbols.size(); + + if (peek() != ":") { + readSectionAddressType(osec); + } + expect(":"); + + std::string location = getCurrentLocation(); + if (consume("AT")) + //cmd->lmaExpr = readParenExpr(); + cmd->osec.address = readParenExpr()().getValue(); + if (consume("ALIGN")) + //cmd->alignExpr = checkAlignment(readParenExpr(), location); + //{ uint64_t align = checkAlignment(readParenExpr(), location)(); + // cmd->osec.address = (cmd->osec->address + (align - 1U)) & ~(align - 1U); } + setError("setting ALIGN on a section unsupported, align the dot instead"); + if (consume("SUBALIGN")) + error("SUBALIGN unsupported"); + //osec->subalignExpr = checkAlignment(readParenExpr(), location); + + // Parse constraints. 
+ if (consume("ONLY_IF_RO")) + setError("constraints like ONLY_IF_RO unsuported"); + //osec->constraint = ConstraintKind::ReadOnly; + if (consume("ONLY_IF_RW")) + setError("constraints like ONLY_IF_RW unsuported"); + //osec->constraint = ConstraintKind::ReadWrite; + expect("{"); + + while (!errorCount() && !consume("}")) { + StringRef tok = next(); + if (tok == ";") { + // Empty commands are allowed. Do nothing here. + } else if (SymbolAssignment *assign = readAssignment(tok)) { + osec->commands.push_back(assign); + } else if (ByteCommand *data = readByteCommand(tok)) { + osec->commands.push_back(data); + } else if (tok == "CONSTRUCTORS") { + // CONSTRUCTORS is a keyword to make the linker recognize C++ ctors/dtors + // by name. This is for very old file formats such as ECOFF/XCOFF. + // For ELF, we should ignore. + } else if (tok == "FILL") { + // We handle the FILL command as an alias for =fillexp section attribute, + // which is different from what GNU linkers do. + // https://sourceware.org/binutils/docs/ld/Output-Section-Data.html + if (peek() != "(") + setError("( expected, but got " + peek()); + setError("FILL unsupported"); //osec->filler = readFill(); + } else if (tok == "SORT") { + readSort(); + } else if (tok == "INCLUDE") { + setError("INCLUDE not supported"); + } else if (tok == "(" || tok == ")") { + setError("expected filename pattern"); + } else if (peek() == "(") { + osec->commands.push_back(readInputSectionDescription(tok)); + } else { + // We have a file name and no input sections description. It is not a + // commonly used syntax, but still acceptable. In that case, all sections + // from the file will be included. + // FIXME: GNU ld permits INPUT_SECTION_FLAGS to be used here. We do not + // handle this case here as it will already have been matched by the + // case above. 
+ auto *isd = make(tok); + isd->sectionPatterns.push_back({{}, StringMatcher("*")}); + osec->commands.push_back(isd); + } + } + + if (consume(">")) + setError("using > not supported"); + //osec->memoryRegionName = std::string(next()); + + if (consume("AT")) { + setError("using AT > not supported"); + expect(">"); + //osec->lmaRegionName = std::string(next()); + } + + //if (osec->lmaExpr && !osec->lmaRegionName.empty()) + // error("section can't have both LMA and a load region"); + + //osec->phdrs = readOutputSectionPhdrs(); + + if (peek() == "=" || peek().starts_with("=")) { + inExpr = true; + consume("="); + setError("filler unsupported"); + //osec->filler = readFill(); + inExpr = false; + } + + // Consume optional comma following output section command. + consume(","); + + //if (referencedSymbols.size() > symbolsReferenced) + // osec->expressionsUseSymbols = true; + return cmd; +} + +// Reads a `=` expression and returns its value as a big-endian number. +// https://sourceware.org/binutils/docs/ld/Output-Section-Fill.html +// We do not support using symbols in such expressions. +// +// When reading a hexstring, ld.bfd handles it as a blob of arbitrary +// size, while ld.gold always handles it as a 32-bit big-endian number. +// We are compatible with ld.gold because it's easier to implement. +// Also, we require that expressions with operators must be wrapped into +// round brackets. We did it to resolve the ambiguity when parsing scripts like: +// SECTIONS { .foo : { ... } =120+3 /DISCARD/ : { ... 
+//   } }
+std::array ScriptParser::readFill() {
+  uint64_t value = readPrimary()().val;
+  if (value > UINT32_MAX)
+    setError("filler expression result does not fit 32-bit: 0x" +
+             Twine::utohexstr(value));
+
+  std::array buf;
+  write32be(buf.data(), (uint32_t)value);
+  return buf;
+}
+
+// Parses the parenthesized "sym = expr" body shared by PROVIDE(), HIDDEN()
+// and PROVIDE_HIDDEN() and tags the resulting assignment with the given
+// flags. If no "=" follows the symbol name, an error is reported, tokens are
+// skipped up to the closing ")" and nullptr is returned.
+SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
+  expect("(");
+  StringRef name = next(), eq = peek();
+  if (eq != "=") {
+    setError("= expected, but got " + next());
+    while (!atEOF() && next() != ")")
+      ;
+    return nullptr;
+  }
+  SymbolAssignment *cmd = readSymbolAssignment(name);
+  cmd->provide = provide;
+  cmd->hidden = hidden;
+  expect(")");
+  return cmd;
+}
+
+// Tries to parse an assignment whose first token is `tok`. Returns nullptr
+// when `tok` does not start an assignment, so the caller can try other
+// command kinds. For everything except ASSERT, the command text is recorded
+// in commandString and the terminating ";" is consumed here.
+SymbolAssignment *ScriptParser::readAssignment(StringRef tok) {
+  // Assert expression returns Dot, so this is equal to ".=."
+  if (tok == "ASSERT")
+    return make(".", readAssert(), getCurrentLocation());
+
+  // Remember the token position so the command text can be rebuilt below.
+  size_t oldPos = pos;
+  SymbolAssignment *cmd = nullptr;
+  const StringRef op = peek();
+  if (op.starts_with("=")) {
+    // Support = followed by an expression without whitespace.
+    SaveAndRestore saved(inExpr, true);
+    cmd = readSymbolAssignment(tok);
+  } else if ((op.size() == 2 && op[1] == '=' && strchr("*/+-&|", op[0])) ||
+             op == "<<=" || op == ">>=") {
+    // Compound assignment: *=, /=, +=, -=, &=, |=, <<= or >>=.
+    cmd = readSymbolAssignment(tok);
+  } else if (tok == "PROVIDE") {
+    SaveAndRestore saved(inExpr, true);
+    cmd = readProvideHidden(true, false);
+  } else if (tok == "HIDDEN") {
+    SaveAndRestore saved(inExpr, true);
+    cmd = readProvideHidden(false, true);
+  } else if (tok == "PROVIDE_HIDDEN") {
+    SaveAndRestore saved(inExpr, true);
+    cmd = readProvideHidden(true, true);
+  }
+
+  if (cmd) {
+    cmd->commandString =
+        tok.str() + " " +
+        llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
+    expect(";");
+  }
+  return cmd;
+}
+
+// Parses "<op> <expr>" following the symbol `name`, where <op> is "=" or one
+// of the compound assignment operators. For a compound operator the stored
+// expression looks up the symbol's current value when it is evaluated and
+// combines it with the right-hand side.
+SymbolAssignment *ScriptParser::readSymbolAssignment(StringRef name) {
+  name = unquote(name);
+  StringRef op = next();
+  assert(op == "=" || op == "*=" || op == "/=" || op == "+=" || op == "-=" ||
+         op == "&=" || op == "|=" || op == "<<=" || op == ">>=");
+  // Note: GNU ld does not support %= or ^=.
+  Expr e = readExpr();
+  if (op != "=") {
+    std::string loc = getCurrentLocation();
+    // The first character of the operator is enough to tell them apart.
+    e = [=, c = op[0]]() -> ExprValue {
+      ExprValue lhs = getSymbolValue(name, loc);
+      switch (c) {
+      case '*':
+        return lhs.getValue() * e().getValue();
+      case '/':
+        if (uint64_t rv = e().getValue())
+          return lhs.getValue() / rv;
+        error(loc + ": division by zero");
+        return 0;
+      case '+':
+        return add(lhs, e());
+      case '-':
+        return sub(lhs, e());
+      case '<':
+        return lhs.getValue() << e().getValue();
+      case '>':
+        return lhs.getValue() >> e().getValue();
+      case '&':
+        return lhs.getValue() & e().getValue();
+      case '|':
+        return lhs.getValue() | e().getValue();
+      default:
+        llvm_unreachable("");
+      }
+    };
+  }
+  return make(name, e, getCurrentLocation());
+}
+
+// This is an operator-precedence parser to parse a linker
+// script expression.
+Expr ScriptParser::readExpr() {
+  // Our lexer is context-aware. Set the in-expression bit so that
+  // it applies different tokenization rules.
+  bool orig = inExpr;
+  inExpr = true;
+  Expr e = readExpr1(readPrimary(), 0);
+  inExpr = orig;
+  return e;
+}
+
+// Builds the deferred expression for the binary operator `op` applied to `l`
+// and `r`. Division and modulo by zero report an error at evaluation time
+// and evaluate to 0. Any operator not listed here is a programming error.
+Expr ScriptParser::combine(StringRef op, Expr l, Expr r) {
+  if (op == "+")
+    return [=] { return add(l(), r()); };
+  if (op == "-")
+    return [=] { return sub(l(), r()); };
+  if (op == "*")
+    return [=] { return l().getValue() * r().getValue(); };
+  if (op == "/") {
+    std::string loc = getCurrentLocation();
+    return [=]() -> uint64_t {
+      if (uint64_t rv = r().getValue())
+        return l().getValue() / rv;
+      error(loc + ": division by zero");
+      return 0;
+    };
+  }
+  if (op == "%") {
+    std::string loc = getCurrentLocation();
+    return [=]() -> uint64_t {
+      if (uint64_t rv = r().getValue())
+        return l().getValue() % rv;
+      error(loc + ": modulo by zero");
+      return 0;
+    };
+  }
+  if (op == "<<")
+    return [=] { return l().getValue() << r().getValue(); };
+  if (op == ">>")
+    return [=] { return l().getValue() >> r().getValue(); };
+  if (op == "<")
+    return [=] { return l().getValue() < r().getValue(); };
+  if (op == ">")
+    return [=] { return l().getValue() > r().getValue(); };
+  if (op == ">=")
+    return [=] { return l().getValue() >= r().getValue(); };
+  if (op == "<=")
+    return [=] { return l().getValue() <= r().getValue(); };
+  if (op == "==")
+    return [=] { return l().getValue() == r().getValue(); };
+  if (op == "!=")
+    return [=] { return l().getValue() != r().getValue(); };
+  if (op == "||")
+    return [=] { return l().getValue() || r().getValue(); };
+  if (op == "&&")
+    return [=] { return l().getValue() && r().getValue(); };
+  if (op == "&")
+    return [=] { return bitAnd(l(), r()); };
+  if (op == "|")
+    return [=] { return bitOr(l(), r()); };
+  llvm_unreachable("invalid operator");
+}
+
+// This is a part of the operator-precedence parser. This function
+// assumes that the remaining token stream starts with an operator.
+Expr ScriptParser::readExpr1(Expr lhs, int minPrec) {
+  while (!atEOF() && !errorCount()) {
+    // Read an operator and an expression.
+    StringRef op1 = peek();
+    if (precedence(op1) < minPrec)
+      break;
+    if (consume("?"))
+      return readTernary(lhs);
+    skip();
+    Expr rhs = readPrimary();
+
+    // Evaluate the remaining part of the expression first if the
+    // next operator has greater precedence than the previous one.
+    // For example, if we have read "+" and "3", and if the next
+    // operator is "*", then we'll evaluate 3 * ... part first.
+    while (!atEOF()) {
+      StringRef op2 = peek();
+      if (precedence(op2) <= precedence(op1))
+        break;
+      rhs = readExpr1(rhs, precedence(op2));
+    }
+
+    lhs = combine(op1, lhs, rhs);
+  }
+  return lhs;
+}
+
+// Returns an expression evaluating to the WebAssembly page size in bytes.
+// This must be exactly 64 KiB (0x10000): it is a power of two, which the
+// alignToPowerOf2() call made for DATA_SEGMENT_RELRO_END relies on, whereas
+// 0xFFFF is neither the page size nor a valid alignment.
+Expr ScriptParser::getPageSize() {
+  return [] { return 0x10000; }; // Wasm page size is 64 KiB (65536 bytes).
+}
+
+// Parses CONSTANT(<name>). Both COMMONPAGESIZE and MAXPAGESIZE evaluate to
+// the page size; any other name is an error and evaluates to 0.
+Expr ScriptParser::readConstant() {
+  StringRef s = readParenLiteral();
+  if (s == "COMMONPAGESIZE")
+    return getPageSize();
+  if (s == "MAXPAGESIZE")
+    return getPageSize();
+  setError("unknown constant: " + s);
+  return [] { return 0; };
+}
+
+// Parses Tok as an integer. It recognizes hexadecimal (prefixed with
+// "0x" or suffixed with "H") and decimal numbers. Decimal numbers may
+// have "K" (Ki) or "M" (Mi) suffixes.
+static std::optional parseInt(StringRef tok) {
+  // Hexadecimal
+  uint64_t val;
+  if (tok.starts_with_insensitive("0x")) {
+    if (!to_integer(tok.substr(2), val, 16))
+      return std::nullopt;
+    return val;
+  }
+  if (tok.ends_with_insensitive("H")) {
+    if (!to_integer(tok.drop_back(), val, 16))
+      return std::nullopt;
+    return val;
+  }
+
+  // Decimal
+  if (tok.ends_with_insensitive("K")) {
+    if (!to_integer(tok.drop_back(), val, 10))
+      return std::nullopt;
+    return val * 1024;
+  }
+  if (tok.ends_with_insensitive("M")) {
+    if (!to_integer(tok.drop_back(), val, 10))
+      return std::nullopt;
+    return val * 1024 * 1024;
+  }
+  if (!to_integer(tok, val, 10))
+    return std::nullopt;
+  return val;
+}
+
+// Parses BYTE(expr), SHORT(expr), LONG(expr) or QUAD(expr). Returns nullptr
+// without consuming anything when `tok` is none of these keywords.
+ByteCommand *ScriptParser::readByteCommand(StringRef tok) {
+  int size = StringSwitch(tok)
+                 .Case("BYTE", 1)
+                 .Case("SHORT", 2)
+                 .Case("LONG", 4)
+                 .Case("QUAD", 8)
+                 .Default(-1);
+  if (size == -1)
+    return nullptr;
+
+  size_t oldPos = pos;
+  Expr e = readParenExpr();
+  std::string commandString =
+      tok.str() + " " +
+      llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
+  return make(e, size, commandString);
+}
+
+// Reads "( <token> )" and returns the token. Expression tokenization is
+// switched off while the token is read and restored afterwards.
+StringRef ScriptParser::readParenLiteral() {
+  expect("(");
+  bool orig = inExpr;
+  inExpr = false;
+  StringRef tok = next();
+  inExpr = orig;
+  expect(")");
+  return tok;
+}
+
+// Reports an error if `osec` has no location recorded, i.e. it was only ever
+// referenced (getOrCreateOutputSection) and never passed through
+// createOutputSection(), which is what sets the location.
+static void checkIfExists(const SectionBase &osec, StringRef location) {
+  if (osec.location.empty())
+    error(location + ": undefined section " + osec.name);
+}
+
+// A symbol name is non-empty, does not start with a digit and consists only
+// of alphanumerics, '$', '.' and '_'.
+static bool isValidSymbolName(StringRef s) {
+  auto valid = [](char c) {
+    return isAlnum(c) || c == '$' || c == '.' || c == '_';
+  };
+  return !s.empty() && !isDigit(s[0]) && llvm::all_of(s, valid);
+}
+
+// Parses a primary expression: a parenthesized expression, a unary "~", "!"
+// or "-" applied to a primary, a built-in function, the location counter, an
+// integer literal or a symbol name. The result is a closure that is evaluated
+// later. Every path returns a callable closure, including the unsupported
+// built-ins that have already reported an error, because an empty Expr would
+// fail when it is eventually invoked.
+Expr ScriptParser::readPrimary() {
+  if (peek() == "(")
+    return readParenExpr();
+
+  if (consume("~")) {
+    Expr e = readPrimary();
+    return [=] { return ~e().getValue(); };
+  }
+  if (consume("!")) {
+    Expr e = readPrimary();
+    return [=] { return !e().getValue(); };
+  }
+  if (consume("-")) {
+    Expr e = readPrimary();
+    return [=] { return -e().getValue(); };
+  }
+
+  StringRef tok = next();
+  std::string location = getCurrentLocation();
+
+  // Built-in functions are parsed here.
+  // https://sourceware.org/binutils/docs/ld/Builtin-Functions.html.
+  if (tok == "ABSOLUTE") {
+    Expr inner = readParenExpr();
+    return [=] {
+      ExprValue i = inner();
+      i.forceAbsolute = true;
+      return i;
+    };
+  }
+  if (tok == "ADDR") {
+    StringRef name = readParenLiteral();
+    SectionBase *osec = &getOrCreateOutputSection(name)->osec;
+    //osec->usedInExpression = true;
+    return [=]() -> ExprValue {
+      checkIfExists(*osec, location);
+      return {osec, false, 0, location};
+    };
+  }
+  if (tok == "ALIGN") {
+    expect("(");
+    Expr e = readExpr();
+    if (consume(")")) {
+      // ALIGN(align): align the location counter.
+      e = checkAlignment(e, location);
+      return [=] { return alignToPowerOf2(dot, e().getValue()); };
+    }
+    // ALIGN(expr, align): attach the alignment to the value of expr.
+    expect(",");
+    Expr e2 = checkAlignment(readExpr(), location);
+    expect(")");
+    return [=] {
+      ExprValue v = e();
+      v.alignment = e2().getValue();
+      return v;
+    };
+  }
+  if (tok == "ALIGNOF") {
+    setError("ALIGNOF unsupported");
+    StringRef name = readParenLiteral();
+    SectionBase *osec = &getOrCreateOutputSection(name)->osec;
+    return [=] {
+      checkIfExists(*osec, location);
+      return 0;//osec->addralign;
+    };
+  }
+  if (tok == "ASSERT")
+    return readAssert();
+  if (tok == "CONSTANT")
+    return readConstant();
+  if (tok == "DATA_SEGMENT_ALIGN") {
+    // Only the first argument is used; the second is parsed and dropped.
+    expect("(");
+    Expr e = readExpr();
+    expect(",");
+    readExpr();
+    expect(")");
+    seenDataAlign = true;
+    return [=] {
+      uint64_t align = std::max(uint64_t(1), e().getValue());
+      return (dot + align - 1) & -align;
+    };
+  }
+  if (tok == "DATA_SEGMENT_END") {
+    expect("(");
+    expect(".");
+    expect(")");
+    return [=] { return dot; }; // = added
+  }
+  if (tok == "DATA_SEGMENT_RELRO_END") {
+    setError("unsupported DATA_SEGMENT_RELRO_END");
+
+    // GNU linkers implement more complicated logic to handle
+    // DATA_SEGMENT_RELRO_END. We instead ignore the arguments and
+    // just align to the next page boundary for simplicity.
+    expect("(");
+    readExpr();
+    expect(",");
+    readExpr();
+    expect(")");
+    seenRelroEnd = true;
+    Expr e = getPageSize();
+    return [=] { return alignToPowerOf2(dot, e().getValue()); };
+  }
+  if (tok == "DEFINED") {
+    StringRef name = unquote(readParenLiteral());
+    return [=] {
+      Symbol *b = symtab->find(name);
+      return (b && b->isDefined()) ? 1 : 0;
+    };
+  }
+  if (tok == "LENGTH") {
+    setError("LENGTH command not supported (no memory region support)");
+    // Not "return 0;": that would build an empty Expr from a null pointer
+    // constant, which fails when called.
+    return [] { return 0; };
+  }
+  if (tok == "LOADADDR") {
+    setError("LOADADDR unsuppported");
+    /*
+    StringRef name = readParenLiteral();
+    OutputSection *osec = &getOrCreateOutputSection(name)->osec;
+    osec->usedInExpression = true;
+    return [=] {
+      checkIfExists(*osec, location);
+      return osec->getLMA();
+    };
+    */
+  }
+  if (tok == "LOG2CEIL") {
+    expect("(");
+    Expr a = readExpr();
+    expect(")");
+    return [=] {
+      // LOG2CEIL(0) is defined to be 0.
+      return llvm::Log2_64_Ceil(std::max(a().getValue(), UINT64_C(1)));
+    };
+  }
+  if (tok == "MAX" || tok == "MIN") {
+    expect("(");
+    Expr a = readExpr();
+    expect(",");
+    Expr b = readExpr();
+    expect(")");
+    if (tok == "MIN")
+      return [=] { return std::min(a().getValue(), b().getValue()); };
+    return [=] { return std::max(a().getValue(), b().getValue()); };
+  }
+  if (tok == "ORIGIN") {
+    setError("ORIGIN command not supported (no memory region support)");
+    // See LENGTH above: return a callable closure, not an empty Expr.
+    return [] { return 0; };
+  }
+  if (tok == "SEGMENT_START") {
+    // The segment name is skipped; the default-address argument is the value.
+    expect("(");
+    skip();
+    expect(",");
+    Expr e = readExpr();
+    expect(")");
+    return [=] { return e(); };
+  }
+  if (tok == "SIZEOF") {
+    setError("SIZEOF unsupported");
+    //StringRef name = readParenLiteral();
+    //SectionBase *cmd = &getOrCreateOutputSection(name)->osec;
+    // Linker script does not create an output section if its content is empty.
+    // We want to allow SIZEOF(.foo) where .foo is a section which happened to
+    // be empty.
+    return [=] { return 0;/*cmd->size;*/ };
+  }
+  if (tok == "SIZEOF_HEADERS")
+    return [=] { return /*elf::getHeaderSize();*/ 0; };
+
+  // Tok is the dot.
+  if (tok == ".")
+    return [=] { return getSymbolValue(tok, location); };
+
+  // Tok is a literal number.
+  if (std::optional val = parseInt(tok))
+    return [=] { return *val; };
+
+  // Tok is a symbol name.
+  if (tok.starts_with("\""))
+    tok = unquote(tok);
+  else if (!isValidSymbolName(tok))
+    setError("malformed number: " + tok);
+  //referencedSymbols.push_back(tok);
+  return [=] { return getSymbolValue(tok, location); };
+}
+
+// Parses the "<expr> : <expr>" tail of a ternary whose "?" has already been
+// consumed; `cond` is the condition parsed so far.
+Expr ScriptParser::readTernary(Expr cond) {
+  Expr l = readExpr();
+  expect(":");
+  Expr r = readExpr();
+  return [=] { return cond().getValue() ? l() : r(); };
+}
+
+// Parses "( <expr> )".
+Expr ScriptParser::readParenExpr() {
+  expect("(");
+  Expr e = readExpr();
+  expect(")");
+  return e;
+}
+
+// Creates the output section description for a definition of `name` found at
+// `location`. A description that exists only because of an earlier forward
+// reference (its location is still empty) is reused; otherwise a new one is
+// made, and it is registered in nameToOutputSection only if the name has no
+// entry yet.
+OutputDesc *ScriptParser::createOutputSection(StringRef name,
+                                              StringRef location) {
+  OutputDesc *&secRef = nameToOutputSection[CachedHashStringRef(name)];
+  OutputDesc *sec;
+  if (secRef && secRef->osec.location.empty()) {
+    // There was a forward reference.
+    sec = secRef;
+  } else {
+    sec = make(name);
+    if (!secRef)
+      secRef = sec;
+  }
+  sec->osec.location = std::string(location);
+  return sec;
+}
+
+// Returns the description registered for `name`, creating one with an empty
+// location if the section has only been referenced so far.
+OutputDesc *ScriptParser::getOrCreateOutputSection(StringRef name) {
+  OutputDesc *&cmdRef = nameToOutputSection[CachedHashStringRef(name)];
+  if (!cmdRef)
+    cmdRef = make(name);
+  return cmdRef;
+}
+
+// Evaluates `name`: "." yields the current location counter, anything else is
+// looked up in the symbol table. Reports an error and yields 0 when no usable
+// symbol is found.
+ExprValue ScriptParser::getSymbolValue(StringRef name, const Twine &loc) {
+  if (name == ".") {
+    //if (state)
+    //  return {state->outSec, false, dot - state->outSec->addr, loc};
+    return {nullptr, false, dot, loc};
+    //error(loc + ": unable to get location counter value");
+    //return 0;
+  }
+
+  if (Symbol *sym = symtab->find(name)) {
+    if (auto *ds = dyn_cast(sym)) {
+      // A bit of a hack to support aliases outside of SECTIONS.
+      // This only works if the evaluation happens after placement into the
+      // output; before that the symbol's own value is used.
+      uint64_t offset = ds->segment && ds->segment->outputSeg
+                            ? ds->segment->outputSeg->startVA +
+                                  ds->segment->outputSegmentOffset
+                            : ds->value;
+      ExprValue v{nullptr, false, offset, loc};
+      // Retain the original st_type, so that the alias will get the same
+      // behavior in relocation processing. Any operation will reset st_type to
+      // STT_NOTYPE.
+      // v.type = ds->type;
+      return v;
+    }
+    //if (isa(sym))
+    //  if (!errorOnMissingSection)
+    //    return {nullptr, false, 0, loc};
+  }
+
+  error(loc + ": symbol not found: " + name);
+  return 0;
+}
diff --git a/lld/wasm/ScriptParser.h b/lld/wasm/ScriptParser.h
new file mode 100644
index 000000000..c0a845e65
--- /dev/null
+++ b/lld/wasm/ScriptParser.h
@@ -0,0 +1,341 @@
+//===- ScriptParser.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_WASM_SCRIPT_PARSER_H
+#define LLD_WASM_SCRIPT_PARSER_H
+
+#include "ScriptParser.h"
+#include "OutputSections.h"
+#include "ScriptLexer.h"
+#include "SymbolTable.h"
+#include "Symbols.h"
+#include "lld/Common/CommonLinkerContext.h"
+#include "lld/Common/Strings.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include
+#include
+#include
+
+namespace lld::wasm {
+
+// This enum is used to implement linker script SECTIONS command.
+// https://sourceware.org/binutils/docs/ld/SECTIONS.html#SECTIONS
+enum SectionsCommandKind {
+  AssignmentKind, // . = expr or <sym> = expr
+  OutputSectionKind,
+  InputSectionKind,
+  ByteKind // BYTE(expr), SHORT(expr), LONG(expr) or QUAD(expr)
+};
+
+struct SectionCommand {
+  SectionCommand(int k) : kind(k) {}
+  int kind;
+};
+
+class SectionBase {
+public:
+/*
+  enum Kind { Regular, Synthetic, EHFrame, Merge, Output };
+
+  Kind kind() const { return (Kind)sectionKind; }
+
+  uint8_t sectionKind : 3;
+
+  // The next two bit fields are only used by InputSectionBase, but we
+  // put them here so the struct packs better.
+
+  uint8_t bss : 1;
+
+  // Set for sections that should not be folded by ICF.
+  uint8_t keepUnique : 1;
+
+  uint8_t partition = 1;
+*/
+// uint32_t type;
+  //union {
+    OutputSection *outputSection;
+    //InputChunk *inputChunk;
+  //};
+
+  StringRef name;
+
+  uint64_t address = 0; // Initialized: the constructor does not set it.
+  //uint32_t addralign;
+  bool live = true; // Live by default, like the disabled "partition = 1".
+
+  SmallVector commands;
+  std::string location;
+
+/*
+  // The 1-indexed partition that this section is assigned to by the garbage
+  // collector, or 0 if this section is dead. Normally there is only one
+  // partition, so this will either be 0 or 1.
+  elf::Partition &getPartition() const;
+
+  // These corresponds to the fields in Elf_Shdr.
+  uint64_t flags;
+  uint32_t addralign;
+  uint32_t entsize;
+  uint32_t link;
+  uint32_t info;
+*/
+
+  OutputSection *getOutputSection() { return outputSection; }
+  const OutputSection *getOutputSection() const {
+    return const_cast(this)->getOutputSection();
+  }
+
+  // Translate an offset in the input section to an offset in the output
+  // section.
+  uint64_t getOffset(uint64_t offset) const { return offset; }
+
+  uint64_t getVA(uint64_t offset = 0) const { return offset; };
+
+  bool isLive() const { return live; } //return partition != 0; }
+  void markLive() { live = 1; }
+  void markDead() { live = 0; }
+
+  SectionBase(OutputSection *osec) : outputSection(osec), name(osec->name) {}
+
+/*
+protected:
+  constexpr SectionBase(/*Kind sectionKind,/ StringRef name, uint64_t flags,
+                        uint32_t entsize, uint32_t addralign, uint32_t type,
+                        uint32_t info, uint32_t link)
+      : name (name) {}
+//      : sectionKind(sectionKind), bss(false), keepUnique(false), type(type),
+//        name(name), flags(flags), addralign(addralign), entsize(entsize),
+//        link(link), info(info) {}
+*/
+};
+
+// This represents an r-value in the linker script.
+struct ExprValue {
+  ExprValue(SectionBase *sec, bool forceAbsolute, uint64_t val,
+            const Twine &loc)
+      : sec(sec), val(val), forceAbsolute(forceAbsolute), loc(loc.str()) {}
+
+  ExprValue(uint64_t val) : ExprValue(nullptr, false, val, "") {}
+
+  bool isAbsolute() const { return forceAbsolute || sec == nullptr; }
+  uint64_t getValue() const;
+  uint64_t getSecAddr() const;
+  uint64_t getSectionOffset() const;
+
+  // If a value is relative to a section, it has a non-null Sec.
+  SectionBase *sec;
+
+  uint64_t val;
+  uint64_t alignment = 1;
+
+  // True if this expression is enclosed in ABSOLUTE().
+  // This flag affects the return value of getValue().
+  bool forceAbsolute;
+
+  // Original source location. Used for error messages.
+  std::string loc;
+};
+
+// This represents an expression in the linker script.
+// ScriptParser::readExpr reads an expression and returns an Expr.
+// Later, we evaluate the expression by calling the function.
+using Expr = std::function;
+
+// This represents ". = <expr>" or "<symbol> = <expr>".
+struct SymbolAssignment : SectionCommand {
+  SymbolAssignment(StringRef name, Expr e, std::string loc)
+      : SectionCommand(AssignmentKind), name(name), expression(e),
+        location(loc) {}
+
+  static bool classof(const SectionCommand *c) {
+    return c->kind == AssignmentKind;
+  }
+
+  // The LHS of an expression. Name is either a symbol name or ".".
+  StringRef name;
+  DefinedData *sym = nullptr;
+
+  // The RHS of an expression.
+  Expr expression;
+
+  // Command attributes for PROVIDE, HIDDEN and PROVIDE_HIDDEN.
+  bool provide = false;
+  bool hidden = false;
+
+  // Holds file name and line number for error reporting.
+  std::string location;
+
+  // A string representation of this command. We use this for -Map.
+  std::string commandString;
+
+  // Address of this assignment command. Not set by the constructor.
+  uint64_t addr;
+
+  // Size of this assignment command. This is usually 0, but if
+  // you move '.' this may be greater than 0. Not set by the constructor.
+  uint64_t size;
+};
+
+struct OutputDesc final : SectionCommand {
+  SectionBase osec;
+  explicit OutputDesc(StringRef name)
+      : SectionCommand(OutputSectionKind), osec(make(ArrayRef())) {
+    osec.name = name;
+  }
+
+  static bool classof(const SectionCommand *c) {
+    return c->kind == OutputSectionKind;
+  }
+};
+
+// For --sort-section and linkerscript sorting rules.
+enum class SortSectionPolicy { Default, None, Alignment, Name, Priority };
+
+// This struct represents one section match pattern in SECTIONS() command.
+// It can optionally have a negative match pattern for the EXCLUDE_FILE command.
+// Also it may be surrounded with SORT() command, so contains sorting rules.
+class SectionPattern {
+
+  // Cache of the most recent input argument and result of excludesFile().
+  mutable std::optional> excludesFileCache;
+
+public:
+  SectionPattern(StringMatcher &&pat1, StringMatcher &&pat2)
+      : excludedFilePat(pat1), sectionPat(pat2),
+        sortOuter(SortSectionPolicy::Default),
+        sortInner(SortSectionPolicy::Default) {}
+
+  bool excludesFile(const InputFile *file) const;
+
+  StringMatcher excludedFilePat;
+  StringMatcher sectionPat;
+  SortSectionPolicy sortOuter;
+  SortSectionPolicy sortInner;
+};
+
+class InputSectionDescription : public SectionCommand {
+  // Cache of the most recent input argument and result of matchesFile().
+  mutable std::optional> matchesFileCache;
+
+public:
+  InputSectionDescription(StringRef filePattern, uint64_t withFlags = 0,
+                          uint64_t withoutFlags = 0)
+      : SectionCommand(InputSectionKind), filePat(filePattern),
+        withFlags(withFlags), withoutFlags(withoutFlags) {}
+
+  static bool classof(const SectionCommand *c) {
+    return c->kind == InputSectionKind;
+  }
+
+  bool matchesFile(const InputFile *file) const;
+
+  SingleStringMatcher filePat;
+
+  // Input sections that match at least one of SectionPatterns
+  // will be associated with this InputSectionDescription.
+  SmallVector sectionPatterns;
+
+  // Includes InputSections and MergeInputSections. Used temporarily during
+  // assignment of input sections to output sections.
+  //SmallVector sectionBases;
+
+  // Used after the finalizeInputSections() pass. MergeInputSections have been
+  // merged into MergeSyntheticSections.
+  SmallVector sections;
+
+  // Temporary record of synthetic ThunkSection instances and the pass that
+  // they were created in. This is used to insert newly created ThunkSections
+  // into Sections at the end of a createThunks() pass.
+  //SmallVector, 0> thunkSections;
+
+  // SectionPatterns can be filtered with the INPUT_SECTION_FLAGS command.
+  uint64_t withFlags;
+  uint64_t withoutFlags;
+};
+
+// Represents BYTE(), SHORT(), LONG(), or QUAD().
+struct ByteCommand : SectionCommand {
+  ByteCommand(Expr e, unsigned size, std::string commandString)
+      : SectionCommand(ByteKind), commandString(commandString), expression(e),
+        size(size) {}
+
+  static bool classof(const SectionCommand *c) { return c->kind == ByteKind; }
+
+  // Keeps the string representing the command. Used for -Map.
+  std::string commandString;
+
+  Expr expression;
+
+  // Offset of this data command in the output section. Not set by the ctor.
+  unsigned offset;
+
+  // Size of this data command.
+  unsigned size;
+};
+
+class ScriptParser final : ScriptLexer {
+public:
+  ScriptParser(MemoryBufferRef mb) : ScriptLexer(mb) { }
+
+  void readLinkerScript();
+
+private:
+  void readOutput();
+  void readSections();
+
+  SymbolAssignment *readSymbolAssignment(StringRef name);
+  ByteCommand *readByteCommand(StringRef tok);
+  std::array readFill();
+  bool readSectionDirective(SectionBase *osec, StringRef tok1, StringRef tok2);
+  void readSectionAddressType(SectionBase *osec);
+  OutputDesc *readOutputSectionDescription(StringRef outSec);
+  InputSectionDescription *readInputSectionDescription(StringRef tok);
+  StringMatcher readFilePatterns();
+  SmallVector readInputSectionsList();
+  InputSectionDescription *readInputSectionRules(StringRef filePattern,
+                                                 uint64_t withFlags,
+                                                 uint64_t withoutFlags);
+  SortSectionPolicy peekSortKind();
+  SortSectionPolicy readSortKind();
+  SymbolAssignment *readProvideHidden(bool provide, bool hidden);
+  SymbolAssignment *readAssignment(StringRef tok);
+  void readSort();
+  Expr readAssert();
+  Expr readConstant();
+  Expr getPageSize();
+
+  Expr combine(StringRef op, Expr l, Expr r);
+  Expr readExpr();
+  Expr readExpr1(Expr lhs, int minPrec);
+  StringRef readParenLiteral();
+  Expr readPrimary();
+  Expr readTernary(Expr cond);
+  Expr readParenExpr();
+
+  bool seenDataAlign = false;
+  bool seenRelroEnd = false;
+
+  // Moved from LinkerScript to here:
+
+  OutputDesc *createOutputSection(StringRef name, StringRef location);
+  OutputDesc *getOrCreateOutputSection(StringRef name);
+  ExprValue getSymbolValue(StringRef name, const Twine &loc);
+
+public:
+  uint64_t dot = 0; // Current value of the location counter ".".
+  //SmallVector referencedSymbols;
+  SmallVector sectionCommands; // Walked by Writer::runScript().
+  bool hasSectionsCommand = false;
+  SmallVector keptSections;
+  llvm::DenseMap nameToOutputSection; // Output section name -> description.
+};
+
+}
+
+#endif
diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp
index d1a06c9ac..3f718a823 100644
--- a/lld/wasm/Writer.cpp
+++ b/lld/wasm/Writer.cpp
@@ -14,6 +14,7 @@
 #include "OutputSections.h"
#include "OutputSegment.h" #include "Relocations.h" +#include "ScriptParser.h" #include "SymbolTable.h" #include "SyntheticSections.h" #include "WriterUtils.h" @@ -92,6 +93,7 @@ private: OutputSegment *createOutputSegment(StringRef name); void combineOutputSegments(); void layoutMemory(); + void runScript(); void createHeader(); void addSection(OutputSection *sec); @@ -499,6 +501,231 @@ void Writer::layoutMemory() { } } +void Writer::runScript() { + if (ctx.isPic || config->relocatable || config->globalBase) { + error("any kind of position independent/dynamic code can't be used with manual memory layout"); + } else if (config->stackFirst) { + error("--stack-first can't be used with manual memory config (place it manually instead)"); + } + + llvm::SmallVector inputSegments; + for (ObjFile *file : ctx.objectFiles) { + for (InputChunk *segment : file->segments) { + if (!segment->live) + continue; + + inputSegments.push_back(segment); + } + } + + // Place segments using linker script. Also assign symbols. + uint64_t memoryPtr = 0; + { + llvm::TimeTraceScope timeScope("Run linker script", + config->linkerScript->getBufferIdentifier()); + ScriptParser parser{*config->linkerScript}; + parser.readLinkerScript(); + + auto handleScriptSymbol = [&] (SymbolAssignment* assign, bool inSec) { + StringRef name = assign->name; + if (name != ".") { + if (!isValidCIdentifier(name)) + return; + + assign->addr = parser.dot; + ExprValue v = assign->expression(); + uint64_t value = v.isAbsolute() ? v.getValue() : v.getSectionOffset(); + log("SCRIPT SET " + name + " to " + Twine(value) + ", dot was " + Twine(parser.dot)); + symtab->addOptionalDataSymbol(saver().save(name), value); + LLVM_DEBUG(dbgs() << "setSymbolAssignment: " << name << "\n"); + } else { //if (assign->sym) { + //if (inSec) { + // error("Assigning to . 
inside section is currently not supported"); + //} + + uint64_t val = assign->expression().getValue(); + if (val < parser.dot) + error(assign->location + ": unable to move location counter backward for: " + name); + + log("SCRIPT DOT " + name + " from " + Twine(parser.dot) + " to " + Twine(val)); + parser.dot = val; + LLVM_DEBUG(dbgs() << "dotSymbolAssignment: " << parser.dot << "\n"); + } + }; + + auto nameComparator = [](InputChunk *a, InputChunk *b) { + return a->name < b->name; + }; + + // Output sections need to have unique names. + // Example: + // osec->name: .rodata + // segment->name: .rodata.123 + // segment->inputSegments: vector of InputChunk:s with names: + // .rodata.foo + // .rodata.foo (yes, again) + // .rodata.bar + // .my.custom.name (i.e. does not have to start with e.g. .rodata) + size_t osecUid = 0; + for (SectionCommand *base : parser.sectionCommands) { + if (auto *osd = dyn_cast(base)) { + SectionBase *osec = &osd->osec; + + for (SectionCommand *cmd : osec->commands) { + if (auto *assign = dyn_cast(cmd)) { + handleScriptSymbol(assign, true); + } else if (auto *isd = dyn_cast(cmd)) { + // If dot is assigned or read while matching, we need to have new OutputSegments, + // so that the startVA can move (and the assignments will work). This means that + // there can be several output segments with the same name (a bit unfortunate). + OutputSegment *segment = make( + saver().save(osec->name + "." + Twine(osecUid++))); + segment->isBss = osec->name.starts_with(".bss"); + if (config->sharedMemory) + segment->initFlags = WASM_DATA_SEGMENT_IS_PASSIVE; + + for (const SectionPattern &pat : isd->sectionPatterns) { + if (!isd->filePat.isTrivialMatchAll() || !pat.excludedFilePat.empty()) + error("Only trivial wildcard patterns are supported for file (i.e. 
*), no excludes"); + + if (pat.sortInner != SortSectionPolicy::Default && + pat.sortInner != SortSectionPolicy::None) + error("Only one level of sorting currently supported in linker scripts"); + + if (pat.sortOuter != SortSectionPolicy::Default && + pat.sortOuter != SortSectionPolicy::None && + pat.sortOuter != SortSectionPolicy::Name) + error("Only sorting on name is currently supported in linker scripts"); + + auto sortStart = segment->inputSegments.end(); + for (InputChunk *chunk : inputSegments) { + // If an input is matched once, never match it again! (This is by spec.) + if (chunk->outputSeg) // Set by addInputSegment() below. + continue; + + if (!pat.sectionPat.match(chunk->name)) + //|| !isd->matchesFile(sec->file) || pat.excludesFile(sec->file)) + continue; + + log("MAPPING " + segment->name + " <--- " + chunk->name); + if (osec->name == "/DISCARD/") { + // The output section name `/DISCARD/' is special. + // Any input section assigned to it is discarded. + chunk->discarded = true; + } else { + segment->addInputSegment(chunk); // Sets chunk->outputSeg. + assert(chunk->outputSeg); + } + } + auto sortEnd = segment->inputSegments.end(); + + // Sorting happens on each pattern, for example *(.foo SORT(.bar.*) .baz) + if (pat.sortOuter == SortSectionPolicy::Name) + std::stable_sort(sortStart, sortEnd, nameComparator); + } + + if (osec->name != "/DISCARD/" && !segment->inputSegments.empty()) { + // The linker script will align dot directly itself. However, we might have to + // increase the alignment to what came from the input files, moving the dot too. + segment->finalizeInputSegments(); // Bake everything, so that we know the size. 
+ log("SCRIPT PLACE " + segment->name + " with size " + Twine(segment->size) + + " dot: script " + Twine(parser.dot) + + " seg " + Twine(alignTo(parser.dot, 1ULL << segment->alignment))); + + parser.dot = alignTo(parser.dot, 1ULL << segment->alignment); + segment->startVA = parser.dot; + parser.dot += segment->size; + + log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", segment->name, + segment->startVA, segment->size, segment->alignment)); + + segments.push_back(segment); + } + } + } + } else if (auto *assign = dyn_cast(base)) { + handleScriptSymbol(assign, false); + } + } + + // Place any remaining segments that were not discarded. + OutputSegment *bonusdata = createOutputSegment(".data.bonus"); // Will call segments.push_back() + OutputSegment *bonusbss = createOutputSegment(".bss.bonus"); // Will call segments.push_back() + for (InputChunk *chunk : inputSegments) { + if (!chunk->outputSeg && !chunk->discarded) { + log("BONUS <--- " + chunk->name); + (chunk->name.starts_with(".bss") ? bonusbss : bonusdata)->addInputSegment(chunk); + } + } + + bonusdata->finalizeInputSegments(); + parser.dot = alignTo(parser.dot, 1ULL << bonusdata->alignment); + bonusdata->startVA = parser.dot; + parser.dot += bonusdata->size; + + bonusbss->finalizeInputSegments(); + parser.dot = alignTo(parser.dot, 1ULL << bonusbss->alignment); + bonusbss->startVA = parser.dot; + parser.dot += bonusbss->size; + + memoryPtr = parser.dot; + } + + // This works fine if there is only one bss segment and it comes last. + // But we can/will have at least two, so let's fake index. 
+ size_t nonIndex = 0; + for (size_t i = 0; i < segments.size(); ++i) + if (needsPassiveInitialization(segments[i]) && !segments[i]->isBss) + segments[i]->index = nonIndex++; + else + segments[i]->index = static_cast(-1); + + // Make space for the memory initialization flag + if (config->sharedMemory && hasPassiveInitializedSegments()) { + memoryPtr = alignTo(memoryPtr, 4); + WasmSym::initMemoryFlag = symtab->addSyntheticDataSymbol( + "__wasm_init_memory_flag", WASM_SYMBOL_VISIBILITY_HIDDEN); + WasmSym::initMemoryFlag->markLive(); + WasmSym::initMemoryFlag->setVA(memoryPtr); + log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", + "__wasm_init_memory_flag", memoryPtr, 4, 4)); + memoryPtr += 4; + } + + memoryPtr = alignTo(memoryPtr, WasmPageSize); + out.memorySec->numMemoryPages = memoryPtr / WasmPageSize; + log("mem: total pages = " + Twine(out.memorySec->numMemoryPages)); + + uint64_t maxMemorySetting = 1ULL << (config->is64.value_or(false) ? 48 : 32); + if (config->initialMemory != 0) { + if (config->initialMemory != alignTo(config->initialMemory, WasmPageSize)) + error("initial memory must be " + Twine(WasmPageSize) + "-byte aligned"); + if (memoryPtr > config->initialMemory) + error("initial memory too small, " + Twine(memoryPtr) + " bytes needed"); + if (config->initialMemory > maxMemorySetting) + error("initial memory too large, cannot be greater than " + + Twine(maxMemorySetting)); + memoryPtr = config->initialMemory; + } + + if (config->maxMemory != 0) { + if (config->maxMemory != alignTo(config->maxMemory, WasmPageSize)) + error("maximum memory must be " + Twine(WasmPageSize) + "-byte aligned"); + if (memoryPtr > config->maxMemory) + error("maximum memory too small, " + Twine(memoryPtr) + " bytes needed"); + if (config->maxMemory > maxMemorySetting) + error("maximum memory too large, cannot be greater than " + + Twine(maxMemorySetting)); + } + + // Check max if explicitly supplied or required by shared memory + if (config->maxMemory != 0 || 
config->sharedMemory) { + uint64_t max = config->maxMemory ? config->maxMemory : memoryPtr; + out.memorySec->maxMemoryPages = max / WasmPageSize; + log("mem: max pages = " + Twine(out.memorySec->maxMemoryPages)); + } +} + void Writer::addSection(OutputSection *sec) { if (!sec->isNeeded()) return; @@ -1694,12 +1921,18 @@ void Writer::run() { WasmSym::definedTableBase32->setVA(config->tableBase); } - log("-- createOutputSegments"); - createOutputSegments(); log("-- createSyntheticSections"); createSyntheticSections(); - log("-- layoutMemory"); - layoutMemory(); + + if (!config->linkerScript) { + log("-- createOutputSegments"); + createOutputSegments(); + log("-- layoutMemory"); + layoutMemory(); + } else { + log("-- runScript"); + runScript(); + } if (!config->relocatable) { // Create linker synthesized __start_SECNAME/__stop_SECNAME symbols -- 2.25.1