From b6aaa0c08f4ec3f0ca31c02f2e77f7ddfb706d3f Mon Sep 17 00:00:00 2001 From: Sebastiaan de Schaetzen Date: Fri, 24 Apr 2026 14:28:57 +0200 Subject: [PATCH] Implement tokenstream_info and refactor TokenStream interface --- v0/test.c | 1 + v0/test_token.c | 41 ++++++++++++----- v0/token.c | 119 +++++++++++++++++++++++++----------------------- v0/token.h | 20 +++++--- 4 files changed, 106 insertions(+), 75 deletions(-) diff --git a/v0/test.c b/v0/test.c index 37bf03f..49fabfb 100644 --- a/v0/test.c +++ b/v0/test.c @@ -35,6 +35,7 @@ static TestCase s_tests[] = { {"tokenstream_comma", test_tokenstream_comma}, {"tokenstream_whitespace_ignored", test_tokenstream_whitespace_ignored}, {"tokenstream_void_function_signature", test_tokenstream_void_function_signature}, + {"tokenstream_info", test_tokenstream_info}, }; diff --git a/v0/test_token.c b/v0/test_token.c index 1b0eff4..2a2bcf9 100644 --- a/v0/test_token.c +++ b/v0/test_token.c @@ -1,5 +1,6 @@ #include "test.h" #include "token.h" +#include <string.h> static void test_tokenstream_open_fail(void) { TokenStream* ts = tokenstream_open(NULL); @@ -7,8 +8,7 @@ static void test_tokenstream_open_fail(void) { } static void test_tokenstream_simple_keyword(void) { - Buffer* buf = buffer_open_string("module"); - TokenStream* ts = tokenstream_open(buf); + TokenStream* ts = tokenstream_open("module"); Token t = tokenstream_next(ts); if (t != TOKEN_MODULE) fail("expected TOKEN_MODULE"); @@ -20,8 +20,7 @@ static void test_tokenstream_simple_keyword(void) { } static void test_tokenstream_keywords_and_symbols(void) { - Buffer* buf = buffer_open_string("module main; import stdio;"); - TokenStream* ts = tokenstream_open(buf); + TokenStream* ts = tokenstream_open("module main; import stdio;"); if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE"); if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER (main)"); @@ -35,8 +34,7 @@ static void test_tokenstream_keywords_and_symbols(void) { } static void
test_tokenstream_parentheses_and_brackets(void) { - Buffer* buf = buffer_open_string("()[]"); - TokenStream* ts = tokenstream_open(buf); + TokenStream* ts = tokenstream_open("()[]"); if (tokenstream_next(ts) != TOKEN_PARENT_OPEN) fail("expected TOKEN_PARENT_OPEN"); if (tokenstream_next(ts) != TOKEN_PARENT_CLOSE) fail("expected TOKEN_PARENT_CLOSE"); @@ -48,8 +46,7 @@ static void test_tokenstream_parentheses_and_brackets(void) { } static void test_tokenstream_comma(void) { - Buffer* buf = buffer_open_string("a,b,c"); - TokenStream* ts = tokenstream_open(buf); + TokenStream* ts = tokenstream_open("a,b,c"); if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected a"); if (tokenstream_next(ts) != TOKEN_COMMA) fail("expected comma"); @@ -62,8 +59,7 @@ static void test_tokenstream_comma(void) { } static void test_tokenstream_whitespace_ignored(void) { - Buffer* buf = buffer_open_string(" module \n\t import ; "); - TokenStream* ts = tokenstream_open(buf); + TokenStream* ts = tokenstream_open(" module \n\t import ; "); if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE"); if (tokenstream_next(ts) != TOKEN_IMPORT) fail("expected TOKEN_IMPORT"); @@ -74,8 +70,7 @@ static void test_tokenstream_whitespace_ignored(void) { } static void test_tokenstream_void_function_signature(void) { - Buffer* buf = buffer_open_string("void main()"); - TokenStream* ts = tokenstream_open(buf); + TokenStream* ts = tokenstream_open("void main()"); if (tokenstream_next(ts) != TOKEN_VOID) fail("expected TOKEN_VOID"); if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER"); @@ -85,3 +80,25 @@ static void test_tokenstream_void_function_signature(void) { tokenstream_close(ts); } + +static void test_tokenstream_info(void) { + TokenStream* ts = tokenstream_open("module main;"); + + Token t1 = tokenstream_next(ts); + TokenInfo info1; + tokenstream_info(ts, &info1); + if (t1 != TOKEN_MODULE) fail("expected TOKEN_MODULE"); + if (info1.token != TOKEN_MODULE) 
fail("info: expected TOKEN_MODULE"); + if (info1.text_length != 6) fail("info: expected length 6"); + if (strncmp(info1.text, "module", 6) != 0) fail("info: expected 'module'"); + + Token t2 = tokenstream_next(ts); + TokenInfo info2; + tokenstream_info(ts, &info2); + if (t2 != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER"); + if (info2.token != TOKEN_IDENTIFIER) fail("info: expected TOKEN_IDENTIFIER"); + if (info2.text_length != 4) fail("info: expected length 4"); + if (strncmp(info2.text, "main", 4) != 0) fail("info: expected 'main'"); + + tokenstream_close(ts); +} diff --git a/v0/token.c b/v0/token.c index 0f7f5af..07ae101 100644 --- a/v0/token.c +++ b/v0/token.c @@ -1,9 +1,14 @@ #include "token.h" -#include "buffer.h" #include #include #include +struct TokenStream { + const char* code; + size_t pos; + TokenInfo last_info; +}; + /** * Easy-to-read and modify keyword-to-token mapping. * Add new keywords here. @@ -23,22 +28,17 @@ static const KeywordMap keywords[] = { * Look up a keyword in the keyword map. * Returns TOKEN_IDENTIFIER if not found. */ -static Token lookup_keyword(const char* str) { +static Token lookup_keyword(const char* str, size_t length) { int count = sizeof(keywords) / sizeof(keywords[0]); for (int i = 0; i < count; i++) { - if (strcmp(keywords[i].keyword, str) == 0) { + if (strlen(keywords[i].keyword) == length && + strncmp(keywords[i].keyword, str, length) == 0) { return keywords[i].token; } } return TOKEN_IDENTIFIER; } -struct TokenStream { - Buffer* buffer; - char lookahead; - int has_lookahead; -}; - /** * Check if a character is the start of an identifier. */ @@ -54,72 +54,63 @@ static int is_identifier_part(char c) { } /** - * Read a character, using lookahead if available. + * Read a character from the stream. 
*/ static char read_char(TokenStream* ts) { - if (ts->has_lookahead) { - ts->has_lookahead = 0; - return ts->lookahead; - } - return buffer_read(ts->buffer); + char c = ts->code[ts->pos]; + if (c == '\0') return (char)-1; + ts->pos++; + return c; } /** - * Put a character back into the lookahead buffer. + * Peek at the next character in the stream. */ -static void unread_char(TokenStream* ts, char c) { - if (c != (char)-1) { - ts->lookahead = c; - ts->has_lookahead = 1; - } +static char peek_char(TokenStream* ts) { + char c = ts->code[ts->pos]; + if (c == '\0') return (char)-1; + return c; } -/** - * Try to read a keyword or identifier starting with the given character. - * Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword. - */ static Token read_keyword_or_identifier(TokenStream* ts, char first) { - char buffer[256]; - int index = 0; - buffer[index++] = first; + const char* start = &ts->code[ts->pos - 1]; + size_t length = 1; - char c; - while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) { - if (index < 255) { - buffer[index++] = c; - } + while (is_identifier_part(peek_char(ts))) { + read_char(ts); + length++; } - /* Put back the character that ended the identifier */ - unread_char(ts, c); - buffer[index] = '\0'; - - /* Check for keywords */ - return lookup_keyword(buffer); + Token token = lookup_keyword(start, length); + ts->last_info.token = token; + ts->last_info.text = (char*)start; + ts->last_info.text_length = length; + return token; } -TokenStream* tokenstream_open(Buffer* buffer) { - if (buffer == NULL) return NULL; +TokenStream* tokenstream_open(const char* code) { + if (code == NULL) return NULL; TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream)); if (ts == NULL) { return NULL; } - ts->buffer = buffer; - ts->lookahead = 0; - ts->has_lookahead = 0; + ts->code = code; + ts->pos = 0; + ts->last_info.text = NULL; + ts->last_info.text_length = 0; + ts->last_info.token = (Token)-1; return ts; } void 
tokenstream_close(TokenStream* ts) { if (ts == NULL) return; - buffer_close(ts->buffer); free(ts); } Token tokenstream_next(TokenStream* ts) { - if (ts == NULL || ts->buffer == NULL) return -1; + if (ts == NULL) return -1; char c; @@ -131,16 +122,14 @@ Token tokenstream_next(TokenStream* ts) { /* Handle comments */ if (c == '/') { - char next = read_char(ts); - if (next == '/') { + if (peek_char(ts) == '/') { /* Skip until end of line */ while ((c = read_char(ts)) != (char)-1 && c != '\n') { /* Skip */ } continue; } - /* Put back the character after / */ - unread_char(ts, next); + /* It's just a slash, which we don't handle yet */ return -1; } @@ -148,16 +137,24 @@ Token tokenstream_next(TokenStream* ts) { break; } - if (c == (char)-1) return -1; /* EOF */ + if (c == (char)-1) { + ts->last_info.token = (Token)-1; + ts->last_info.text = NULL; + ts->last_info.text_length = 0; + return -1; /* EOF */ + } /* Single-character tokens */ + ts->last_info.text = (char*)&ts->code[ts->pos - 1]; + ts->last_info.text_length = 1; + switch (c) { - case '(': return TOKEN_PARENT_OPEN; - case ')': return TOKEN_PARENT_CLOSE; - case '[': return TOKEN_BRACKET_OPEN; - case ']': return TOKEN_BRACKET_CLOSE; - case ',': return TOKEN_COMMA; - case ';': return TOKEN_SEMICOLON; + case '(': return ts->last_info.token = TOKEN_PARENT_OPEN; + case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE; + case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN; + case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE; + case ',': return ts->last_info.token = TOKEN_COMMA; + case ';': return ts->last_info.token = TOKEN_SEMICOLON; } /* Keywords and identifiers */ @@ -166,5 +163,13 @@ Token tokenstream_next(TokenStream* ts) { } /* Unknown character */ + ts->last_info.token = (Token)-1; + ts->last_info.text = NULL; + ts->last_info.text_length = 0; return -1; } + +void tokenstream_info(TokenStream* ts, TokenInfo* info) { + if (ts == NULL || info == NULL) return; + *info = ts->last_info; +} diff --git 
a/v0/token.h b/v0/token.h index 9482d68..5be1565 100644 --- a/v0/token.h +++ b/v0/token.h @@ -4,7 +4,7 @@ #ifndef TOKEN_H #define TOKEN_H -#include "buffer.h" +#include /** * A list of all possible tokens. @@ -34,6 +34,7 @@ typedef enum { */ typedef struct { /// @brief The textual representation of a token. + /// Note that this is not necessarily null-terminated. char* text; /// @brief The length of the `text` string. @@ -46,14 +47,12 @@ typedef struct { typedef struct TokenStream TokenStream; /** - * Returns a TokenStream for a given buffer. + * Returns a TokenStream for a text. * - * When the tokenstream is closed, the underlying buffer is also closed. - * - * @param buffer The buffer to read from. + * @param code The text to read. * @returns A handle to the TokenStream. */ -TokenStream* tokenstream_open(Buffer* buffer); +TokenStream* tokenstream_open(const char* code); /** * Closes a TokenStream. @@ -68,4 +67,13 @@ void tokenstream_close(TokenStream* ts); */ Token tokenstream_next(TokenStream* ts); +/** + * Gets additional information about the last token that was returned + * by `tokenstream_next`. + * + * @param ts The TokenStream to use. + * @param info The TokenInfo object to store the results in. + */ +void tokenstream_info(TokenStream* ts, TokenInfo* info); + #endif \ No newline at end of file