From eb4b0495f250e5f9746ecbae2685579e4ebcdbe7 Mon Sep 17 00:00:00 2001 From: Sebastiaan de Schaetzen Date: Wed, 29 Apr 2026 14:36:42 +0200 Subject: [PATCH] Working on parser refactor --- v0/ast.h | 3 + v0/include.mk | 2 +- v0/log.c | 4 +- v0/log.h | 3 +- v0/parser.c | 349 +++++++++++++++++++----------- v0/str.c | 11 + v0/str.h | 10 + v0/test_log.c | 4 +- v0/tests/parser_public_imports.c2 | 3 +- v0/token.c | 2 +- 10 files changed, 253 insertions(+), 138 deletions(-) create mode 100644 v0/str.c diff --git a/v0/ast.h b/v0/ast.h index b6010ed..069298d 100644 --- a/v0/ast.h +++ b/v0/ast.h @@ -57,6 +57,9 @@ typedef struct { /** @brief The value of the alias. */ TypeExpression value; + + /** @brief Whether the alias is public or not. */ + bool is_public; } AliasDeclaration; /** diff --git a/v0/include.mk b/v0/include.mk index 5185d04..b1ba040 100644 --- a/v0/include.mk +++ b/v0/include.mk @@ -1,4 +1,4 @@ -V0_SRC := v0/main.c v0/util.c v0/token.c v0/parser.c v0/log.c +V0_SRC := v0/main.c v0/util.c v0/token.c v0/parser.c v0/log.c v0/str.c # V0_TEST must only include `v0/test.c` itself, as all other test C–source files are # included directly into `v0/test.c` using `#include "test_xyz.c"`. diff --git a/v0/log.c b/v0/log.c index f8a2176..a2013d6 100644 --- a/v0/log.c +++ b/v0/log.c @@ -20,7 +20,7 @@ void log_error(const char* msg) { } } -void log_on_line(Location* loc, int to_column, const char* msg, ...) { +void log_on_line(Location* loc, const char* msg, ...) { /* Declarations first for C89 */ char* line_prefix = NULL; char* formatted_msg = NULL; @@ -36,7 +36,7 @@ void log_on_line(Location* loc, int to_column, const char* msg, ...) 
{ line_prefix = format_string("%d| ", loc->line); if (!line_prefix) goto cleanup; - caret_len = to_column - loc->column_start + 1; + caret_len = loc->column_end - loc->column_start + 1; if (caret_len < 1) caret_len = 1; /* Format the message */ diff --git a/v0/log.h b/v0/log.h index 3494903..766e543 100644 --- a/v0/log.h +++ b/v0/log.h @@ -28,10 +28,9 @@ void log_error(const char* msg); * It additionally supports the `%S` format specifier, which can be used to format a `String` structure from `string.h`. * * @param loc The location where the error occurred. - * @param to_column The column number where the error ends. * @param msg The error message to log. This can contain format specifiers like printf, and the additional arguments will be formatted into the message. * @param ... Additional arguments to format into the error message. */ -void log_on_line(Location* loc, int to_column, const char* msg, ...); +void log_on_line(Location* loc, const char* msg, ...); #endif diff --git a/v0/parser.c b/v0/parser.c index 1f564e9..35b7e0a 100644 --- a/v0/parser.c +++ b/v0/parser.c @@ -4,165 +4,256 @@ #include #include -/** - * Parses an import declaration. - * - * @param ts The token stream to parse from. - * @param module The module being parsed. - * @returns true on success, false on failure. - */ -static bool parse_import(TokenStream* ts, Module* module) { - ImportDeclaration* new_imports = realloc(module->imports, (module->import_count + 1) * sizeof(ImportDeclaration)); - if (!new_imports) { - fprintf(stderr, "Out of memory\n"); - exit(1); - } - module->imports = new_imports; +// /** +// * Parses an import declaration. +// * +// * @param ts The token stream to parse from. +// * @param module The module being parsed. +// * @returns true on success, false on failure. 
+// */ +// static bool parse_import(TokenStream* ts, Module* module) { +// ImportDeclaration* new_imports = realloc(module->imports, (module->import_count + 1) * sizeof(ImportDeclaration)); +// if (!new_imports) { +// fprintf(stderr, "Out of memory\n"); +// exit(1); +// } +// module->imports = new_imports; - Token t = tokenstream_next(ts); - bool is_public = false; - if (t.token == TOKEN_PUBLIC) { - is_public = true; - t = tokenstream_next(ts); - } +// Token t = tokenstream_next(ts); +// bool is_public = false; +// if (t.token == TOKEN_PUBLIC) { +// is_public = true; +// t = tokenstream_next(ts); +// } - if (t.token != TOKEN_IDENTIFIER) { - log_on_line(&t.location, t.location.column_end, "expected module name to import"); - return false; - } +// if (t.token != TOKEN_IDENTIFIER) { +// log_on_line(&t.location, t.location.column_end, "expected module name to import"); +// return false; +// } - char* name = (char*)malloc(t.text.length + 1); - memcpy(name, t.text.data, t.text.length); - name[t.text.length] = '\0'; +// char* name = (char*)malloc(t.text.length + 1); +// memcpy(name, t.text.data, t.text.length); +// name[t.text.length] = '\0'; - module->imports[module->import_count] = (ImportDeclaration){ .module_name = name, .is_public = is_public }; - module->import_count++; +// module->imports[module->import_count] = (ImportDeclaration){ .module_name = name, .is_public = is_public }; +// module->import_count++; - t = tokenstream_next(ts); - if (t.token != TOKEN_SEMICOLON) { - log_on_line(&t.location, t.location.column_end, "expected ';' after import"); - return false; - } - return true; +// t = tokenstream_next(ts); +// if (t.token != TOKEN_SEMICOLON) { +// log_on_line(&t.location, t.location.column_end, "expected ';' after import"); +// return false; +// } +// return true; +// } + +// /** +// * Parses an alias declaration. +// * +// * @param ts The token stream to parse from. +// * @param module The module being parsed. +// * @returns true on success, false on failure. 
+// */ +// static bool parse_alias(TokenStream* ts, Module* module) { +// AliasDeclaration* new_aliases = realloc(module->aliases, (module->alias_count + 1) * sizeof(AliasDeclaration)); +// if (!new_aliases) { +// fprintf(stderr, "Out of memory\n"); +// exit(1); +// } +// module->aliases = new_aliases; + +// Token t = tokenstream_next(ts); +// if (t.token != TOKEN_IDENTIFIER) { +// log_on_line(&t.location, t.location.column_end, "expected alias name"); +// return false; +// } +// char* name = (char*)malloc(t.text.length + 1); +// memcpy(name, t.text.data, t.text.length); +// name[t.text.length] = '\0'; +// AliasDeclaration alias; +// alias.name = name; + +// t = tokenstream_next(ts); +// if (t.token != TOKEN_ASSIGN) { +// log_on_line(&t.location, t.location.column_end, "expected '='"); +// return false; +// } + +// t = tokenstream_next(ts); + +// TypeExpression type; +// if (t.token == TOKEN_IDENTIFIER && strncmp(t.text.data, "int32", t.text.length) == 0) { +// type = (TypeExpression){ .tag = TYPE_EXPRESSION_BUILTIN, .builtin = { .bitSize = 32, .isSigned = true } }; +// t = tokenstream_next(ts); +// if (t.token == TOKEN_BRACKET_OPEN) { +// t = tokenstream_next(ts); +// if (t.token != TOKEN_BRACKET_CLOSE) { +// log_on_line(&t.location, t.location.column_end, "expected ']'"); +// return false; +// } +// TypeExpression* inner = malloc(sizeof(TypeExpression)); +// *inner = type; +// type = (TypeExpression){ .tag = TYPE_EXPRESSION_ARRAY, .array = { .array = inner } }; +// t = tokenstream_next(ts); +// } +// } else { +// log_on_line(&t.location, t.location.column_end, "expected type"); +// return false; +// } + +// alias.value = type; + +// module->aliases[module->alias_count] = alias; +// module->alias_count++; + +// if (t.token != TOKEN_SEMICOLON) { +// log_on_line(&t.location, t.location.column_end, "expected ';'"); +// return false; +// } +// return true; +// } + +typedef struct { + TokenStream* ts; /* Token stream that parser_next_token reads from. */ + Token token; /* Token most recently read by parser_next_token; the parser_* helpers inspect this one. */ +} Parser; + +/** + * Reads a new token into 
p->token. + */ +static void parser_next_token(Parser* p) { + p->token = tokenstream_next(p->ts); } /** - * Parses an alias declaration. + * Consumes the current token if it is equal to the expected token. * - * @param ts The token stream to parse from. - * @param module The module being parsed. - * @returns true on success, false on failure. + * On a match it advances to the next token; otherwise the current token is left untouched. + * + * @param p The parser whose current token is compared. + * @param token The expected token. + * @returns `true` if the current token matches the expected, `false` if it does not. */ -static bool parse_alias(TokenStream* ts, Module* module) { - AliasDeclaration* new_aliases = realloc(module->aliases, (module->alias_count + 1) * sizeof(AliasDeclaration)); - if (!new_aliases) { - fprintf(stderr, "Out of memory\n"); - exit(1); +static bool parser_accept(Parser* p, TokenType token) { + if (p->token.token == token) { + parser_next_token(p); + return true; } - module->aliases = new_aliases; + return false; +} - Token t = tokenstream_next(ts); - if (t.token != TOKEN_IDENTIFIER) { - log_on_line(&t.location, t.location.column_end, "expected alias name"); - return false; +/** + * Like parser_accept, but on a mismatch passes `msg` (as the format string) to log_on_line at the current token's location. + */ +static bool parser_expect(Parser* p, TokenType token, const char* msg) { + if (parser_accept(p, token)) { + return true; } - char* name = (char*)malloc(t.text.length + 1); - memcpy(name, t.text.data, t.text.length); - name[t.text.length] = '\0'; - AliasDeclaration alias; - alias.name = name; + log_on_line(&p->token.location, msg); + return false; +} - t = tokenstream_next(ts); - if (t.token != TOKEN_ASSIGN) { - log_on_line(&t.location, t.location.column_end, "expected '='"); +/** + * Reports whether the current token equals `token`, without consuming it. + */ +static bool parser_peek(Parser* p, TokenType token) { + if (p->token.token == token) { + return true; + } + return false; +} + +/** + * Like parser_peek, but on a mismatch passes `msg` (as the format string) to log_on_line at the current token's location. + */ +static bool parser_require(Parser* p, TokenType token, const char* msg) { + if (parser_peek(p, token)) { + return true; + } + 
log_on_line(&p->token.location, msg); + return false; +} + +/** + * Copies the text of the current token with string_copy, then advances to the next token. + * @returns The heap-allocated copy made by string_copy; the caller has to free it. + */ +static char* parser_to_text(Parser* p) { + char* str = string_copy(p->token.text); + parser_next_token(p); + return str; +} + +/** + * Parses the module header: the `module` keyword, an identifier copied into module->name, and a ';'. + */ +static bool parse_declaration_module(Parser* p, Module* module) { + if (!parser_expect(p, TOKEN_MODULE, "expected keyword 'module'")) { return false; } - t = tokenstream_next(ts); - - TypeExpression type; - if (t.token == TOKEN_IDENTIFIER && strncmp(t.text.data, "int32", t.text.length) == 0) { - type = (TypeExpression){ .tag = TYPE_EXPRESSION_BUILTIN, .builtin = { .bitSize = 32, .isSigned = true } }; - t = tokenstream_next(ts); - if (t.token == TOKEN_BRACKET_OPEN) { - t = tokenstream_next(ts); - if (t.token != TOKEN_BRACKET_CLOSE) { - log_on_line(&t.location, t.location.column_end, "expected ']'"); - return false; - } - TypeExpression* inner = malloc(sizeof(TypeExpression)); - *inner = type; - type = (TypeExpression){ .tag = TYPE_EXPRESSION_ARRAY, .array = { .array = inner } }; - t = tokenstream_next(ts); - } - } else { - log_on_line(&t.location, t.location.column_end, "expected type"); + if (!parser_require(p, TOKEN_IDENTIFIER, "expected module identifier")) { + return false; + } + module->name = parser_to_text(p); + + return parser_expect(p, TOKEN_SEMICOLON, "expected ';' after module name"); +} + +static bool parse_import(Parser* p, Module* module, bool is_public) { /* Appends one ImportDeclaration to module->imports; parser_parse calls it after accepting `import`. */ + module->import_count++; + module->imports = realloc(module->imports, sizeof(ImportDeclaration) * module->import_count); /* NOTE(review): realloc result is unchecked, and parser_parse never initializes imports/import_count. */ + ImportDeclaration* import = &module->imports[module->import_count - 1]; + + if (!parser_expect(p, TOKEN_IDENTIFIER, "expected module identifier")) { /* NOTE(review): parser_expect already advances past the identifier, so parser_to_text below copies the token after it; parse_declaration_module uses parser_require for this step. */ return false; } - alias.value = type; - - module->aliases[module->alias_count] = alias; - module->alias_count++; + import->module_name = parser_to_text(p); + import->is_public = is_public; + return true; +} - if (t.token != 
TOKEN_SEMICOLON) { - log_on_line(&t.location, t.location.column_end, "expected ';'"); - return false; - } +static bool parse_alias(Parser* p, Module* module, bool is_public) { + /* TODO: not implemented; returns true without consuming any token or modifying module. */ return true; } Module* parser_parse(TokenStream* ts) { - Token t = tokenstream_next(ts); - if (t.token != TOKEN_MODULE) { - log_on_line(&t.location, t.location.column_end, "expected 'module' keyword"); - return NULL; - } + Parser* p = malloc(sizeof(Parser)); /* NOTE(review): result is not checked, and p is not freed on either return path below. */ + p->ts = ts; + parser_next_token(p); - t = tokenstream_next(ts); - if (t.token != TOKEN_IDENTIFIER) { - log_on_line(&t.location, t.location.column_end, "expected module name"); - return NULL; + Module* module = malloc(sizeof(Module)); /* NOTE(review): unchecked, and imports/import_count/aliases/alias_count are no longer initialized (see the removed lines further down). */ + if (!parse_declaration_module(p, module)) { + goto fail; } - Module* module = (Module*)malloc(sizeof(Module)); - if (module == NULL) { - fprintf(stderr, "Out of memory\n"); - exit(1); + while (!parser_peek(p, TOKEN_EOF)) { /* One declaration per iteration: any number of `public` modifiers, then `import` or `alias`. */ + bool is_public = false; + bool terminal = false; + do { + if (parser_accept(p, TOKEN_IMPORT)) { + if (!parse_import(p, module, is_public)) { + goto fail; + } + terminal = true; + } else if (parser_accept(p, TOKEN_ALIAS)) { + if (!parse_alias(p, module, is_public)) { + goto fail; + } + terminal = true; + } else if (parser_accept(p, TOKEN_PUBLIC)) { + is_public = true; + } else { + log_on_line(&p->token.location, "unexpected token"); /* NOTE(review): nothing is consumed and terminal stays false, so the do/while re-tests the same token. */ + } + } while (!terminal); } - module->name = (const char*)malloc(t.text.length + 1); - memcpy((void*)module->name, t.text.data, t.text.length); - ((char*)module->name)[t.text.length] = '\0'; - module->imports = NULL; - module->import_count = 0; - module->aliases = NULL; - module->alias_count = 0; - t = tokenstream_next(ts); - if (t.token != TOKEN_SEMICOLON) { - log_on_line(&t.location, t.location.column_end, "expected ';' after module name"); - parser_free(module); - return NULL; - } - - while (1) { - t = tokenstream_next(ts); - if (t.token == TOKEN_IMPORT) { - if (!parse_import(ts, module)) { - parser_free(module); - return NULL; - 
} - } else if (t.token == TOKEN_ALIAS) { - if (!parse_alias(ts, module)) { - parser_free(module); - return NULL; - } - } else { - break; - } - } - - return module; + return module; +fail: + free(module); /* NOTE(review): releases only the struct; the replaced code called parser_free(module) on failure. */ + return NULL; } void free_type_expression(TypeExpression* expr) { diff --git a/v0/str.c b/v0/str.c new file mode 100644 index 0000000..9b63572 --- /dev/null +++ b/v0/str.c @@ -0,0 +1,11 @@ +#include "str.h" + +#include +#include + +char* string_copy(String string) { + char* str = malloc(string.length + 1); /* NOTE(review): allocation result is used without a NULL check. */ + memcpy(str, string.data, string.length); + str[string.length] = '\0'; + return str; +} diff --git a/v0/str.h b/v0/str.h index 52ea019..464fd7c 100644 --- a/v0/str.h +++ b/v0/str.h @@ -14,4 +14,14 @@ typedef struct { size_t length; } String; +/** + * Creates a copy of a string. + * + * Note that this copy has to be freed afterwards. + * + * @param string The string to copy. + * @returns A null-terminated copy of the string. + */ +char* string_copy(String string); + #endif diff --git a/v0/test_log.c b/v0/test_log.c index 58dac83..7d171bb 100644 --- a/v0/test_log.c +++ b/v0/test_log.c @@ -34,7 +34,7 @@ static void test_log_on_line(void) { loc.column_start = 12; loc.column_end = 13; - log_on_line(&loc, 13, "unexpected token"); + log_on_line(&loc, "unexpected token"); assert_log_file("expected formatted error message"); } @@ -47,6 +47,6 @@ static void test_log_on_line_variadic(void) { loc.column_start = 12; loc.column_end = 13; - log_on_line(&loc, 13, "unexpected token '%c'", 'x'); + log_on_line(&loc, "unexpected token '%c'", 'x'); assert_log_file("expected formatted error message with variadic args"); } diff --git a/v0/tests/parser_public_imports.c2 b/v0/tests/parser_public_imports.c2 index fe81561..2a1c38a 100644 --- a/v0/tests/parser_public_imports.c2 +++ b/v0/tests/parser_public_imports.c2 @@ -1,2 +1,3 @@ module my_module; -import public other_module; + +public import other_module; diff --git a/v0/token.c b/v0/token.c index 7fab939..0648076 100644 --- 
a/v0/token.c +++ b/v0/token.c @@ -241,6 +241,6 @@ Token tokenstream_next(TokenStream* ts) { /* Unknown character */ t = create_token(ts, TOKEN_UNKNOWN, start_text, 1, start_line, start_column, line_start); - log_on_line(&t.location, t.location.column_end, "unexpected token '%c'", c); + log_on_line(&t.location, "unexpected token '%c'", c); return t; }