Implement tokenstream_info and refactor TokenStream interface

This commit is contained in:
2026-04-24 14:28:57 +02:00
parent 1406cedd82
commit b6aaa0c08f
4 changed files with 106 additions and 75 deletions
+1
View File
@@ -35,6 +35,7 @@ static TestCase s_tests[] = {
{"tokenstream_comma", test_tokenstream_comma}, {"tokenstream_comma", test_tokenstream_comma},
{"tokenstream_whitespace_ignored", test_tokenstream_whitespace_ignored}, {"tokenstream_whitespace_ignored", test_tokenstream_whitespace_ignored},
{"tokenstream_void_function_signature", test_tokenstream_void_function_signature}, {"tokenstream_void_function_signature", test_tokenstream_void_function_signature},
{"tokenstream_info", test_tokenstream_info},
}; };
+29 -12
View File
@@ -1,5 +1,6 @@
#include "test.h" #include "test.h"
#include "token.h" #include "token.h"
#include <string.h>
static void test_tokenstream_open_fail(void) { static void test_tokenstream_open_fail(void) {
TokenStream* ts = tokenstream_open(NULL); TokenStream* ts = tokenstream_open(NULL);
@@ -7,8 +8,7 @@ static void test_tokenstream_open_fail(void) {
} }
static void test_tokenstream_simple_keyword(void) { static void test_tokenstream_simple_keyword(void) {
Buffer* buf = buffer_open_string("module"); TokenStream* ts = tokenstream_open("module");
TokenStream* ts = tokenstream_open(buf);
Token t = tokenstream_next(ts); Token t = tokenstream_next(ts);
if (t != TOKEN_MODULE) fail("expected TOKEN_MODULE"); if (t != TOKEN_MODULE) fail("expected TOKEN_MODULE");
@@ -20,8 +20,7 @@ static void test_tokenstream_simple_keyword(void) {
} }
static void test_tokenstream_keywords_and_symbols(void) { static void test_tokenstream_keywords_and_symbols(void) {
Buffer* buf = buffer_open_string("module main; import stdio;"); TokenStream* ts = tokenstream_open("module main; import stdio;");
TokenStream* ts = tokenstream_open(buf);
if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE"); if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER (main)"); if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER (main)");
@@ -35,8 +34,7 @@ static void test_tokenstream_keywords_and_symbols(void) {
} }
static void test_tokenstream_parentheses_and_brackets(void) { static void test_tokenstream_parentheses_and_brackets(void) {
Buffer* buf = buffer_open_string("()[]"); TokenStream* ts = tokenstream_open("()[]");
TokenStream* ts = tokenstream_open(buf);
if (tokenstream_next(ts) != TOKEN_PARENT_OPEN) fail("expected TOKEN_PARENT_OPEN"); if (tokenstream_next(ts) != TOKEN_PARENT_OPEN) fail("expected TOKEN_PARENT_OPEN");
if (tokenstream_next(ts) != TOKEN_PARENT_CLOSE) fail("expected TOKEN_PARENT_CLOSE"); if (tokenstream_next(ts) != TOKEN_PARENT_CLOSE) fail("expected TOKEN_PARENT_CLOSE");
@@ -48,8 +46,7 @@ static void test_tokenstream_parentheses_and_brackets(void) {
} }
static void test_tokenstream_comma(void) { static void test_tokenstream_comma(void) {
Buffer* buf = buffer_open_string("a,b,c"); TokenStream* ts = tokenstream_open("a,b,c");
TokenStream* ts = tokenstream_open(buf);
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected a"); if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected a");
if (tokenstream_next(ts) != TOKEN_COMMA) fail("expected comma"); if (tokenstream_next(ts) != TOKEN_COMMA) fail("expected comma");
@@ -62,8 +59,7 @@ static void test_tokenstream_comma(void) {
} }
static void test_tokenstream_whitespace_ignored(void) { static void test_tokenstream_whitespace_ignored(void) {
Buffer* buf = buffer_open_string(" module \n\t import ; "); TokenStream* ts = tokenstream_open(" module \n\t import ; ");
TokenStream* ts = tokenstream_open(buf);
if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE"); if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
if (tokenstream_next(ts) != TOKEN_IMPORT) fail("expected TOKEN_IMPORT"); if (tokenstream_next(ts) != TOKEN_IMPORT) fail("expected TOKEN_IMPORT");
@@ -74,8 +70,7 @@ static void test_tokenstream_whitespace_ignored(void) {
} }
static void test_tokenstream_void_function_signature(void) { static void test_tokenstream_void_function_signature(void) {
Buffer* buf = buffer_open_string("void main()"); TokenStream* ts = tokenstream_open("void main()");
TokenStream* ts = tokenstream_open(buf);
if (tokenstream_next(ts) != TOKEN_VOID) fail("expected TOKEN_VOID"); if (tokenstream_next(ts) != TOKEN_VOID) fail("expected TOKEN_VOID");
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER"); if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
@@ -85,3 +80,25 @@ static void test_tokenstream_void_function_signature(void) {
tokenstream_close(ts); tokenstream_close(ts);
} }
static void test_tokenstream_info(void) {
TokenStream* ts = tokenstream_open("module main;");
Token t1 = tokenstream_next(ts);
TokenInfo info1;
tokenstream_info(ts, &info1);
if (t1 != TOKEN_MODULE) fail("expected TOKEN_MODULE");
if (info1.token != TOKEN_MODULE) fail("info: expected TOKEN_MODULE");
if (info1.text_length != 6) fail("info: expected length 6");
if (strncmp(info1.text, "module", 6) != 0) fail("info: expected 'module'");
Token t2 = tokenstream_next(ts);
TokenInfo info2;
tokenstream_info(ts, &info2);
if (t2 != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
if (info2.token != TOKEN_IDENTIFIER) fail("info: expected TOKEN_IDENTIFIER");
if (info2.text_length != 4) fail("info: expected length 4");
if (strncmp(info2.text, "main", 4) != 0) fail("info: expected 'main'");
tokenstream_close(ts);
}
+62 -57
View File
@@ -1,9 +1,14 @@
#include "token.h" #include "token.h"
#include "buffer.h"
#include <stdlib.h> #include <stdlib.h>
#include <ctype.h> #include <ctype.h>
#include <string.h> #include <string.h>
struct TokenStream {
const char* code;
size_t pos;
TokenInfo last_info;
};
/** /**
* Easy-to-read and modify keyword-to-token mapping. * Easy-to-read and modify keyword-to-token mapping.
* Add new keywords here. * Add new keywords here.
@@ -23,22 +28,17 @@ static const KeywordMap keywords[] = {
* Look up a keyword in the keyword map. * Look up a keyword in the keyword map.
* Returns TOKEN_IDENTIFIER if not found. * Returns TOKEN_IDENTIFIER if not found.
*/ */
static Token lookup_keyword(const char* str) { static Token lookup_keyword(const char* str, size_t length) {
int count = sizeof(keywords) / sizeof(keywords[0]); int count = sizeof(keywords) / sizeof(keywords[0]);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
if (strcmp(keywords[i].keyword, str) == 0) { if (strlen(keywords[i].keyword) == length &&
strncmp(keywords[i].keyword, str, length) == 0) {
return keywords[i].token; return keywords[i].token;
} }
} }
return TOKEN_IDENTIFIER; return TOKEN_IDENTIFIER;
} }
struct TokenStream {
Buffer* buffer;
char lookahead;
int has_lookahead;
};
/** /**
* Check if a character is the start of an identifier. * Check if a character is the start of an identifier.
*/ */
@@ -54,72 +54,63 @@ static int is_identifier_part(char c) {
} }
/** /**
* Read a character, using lookahead if available. * Read a character from the stream.
*/ */
static char read_char(TokenStream* ts) { static char read_char(TokenStream* ts) {
if (ts->has_lookahead) { char c = ts->code[ts->pos];
ts->has_lookahead = 0; if (c == '\0') return (char)-1;
return ts->lookahead; ts->pos++;
} return c;
return buffer_read(ts->buffer);
} }
/** /**
* Put a character back into the lookahead buffer. * Peek at the next character in the stream.
*/ */
static void unread_char(TokenStream* ts, char c) { static char peek_char(TokenStream* ts) {
if (c != (char)-1) { char c = ts->code[ts->pos];
ts->lookahead = c; if (c == '\0') return (char)-1;
ts->has_lookahead = 1; return c;
}
} }
/**
* Try to read a keyword or identifier starting with the given character.
* Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword.
*/
static Token read_keyword_or_identifier(TokenStream* ts, char first) { static Token read_keyword_or_identifier(TokenStream* ts, char first) {
char buffer[256]; const char* start = &ts->code[ts->pos - 1];
int index = 0; size_t length = 1;
buffer[index++] = first;
char c; while (is_identifier_part(peek_char(ts))) {
while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) { read_char(ts);
if (index < 255) { length++;
buffer[index++] = c;
}
} }
/* Put back the character that ended the identifier */ Token token = lookup_keyword(start, length);
unread_char(ts, c); ts->last_info.token = token;
buffer[index] = '\0'; ts->last_info.text = (char*)start;
ts->last_info.text_length = length;
/* Check for keywords */ return token;
return lookup_keyword(buffer);
} }
TokenStream* tokenstream_open(Buffer* buffer) { TokenStream* tokenstream_open(const char* code) {
if (buffer == NULL) return NULL; if (code == NULL) return NULL;
TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream)); TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream));
if (ts == NULL) { if (ts == NULL) {
return NULL; return NULL;
} }
ts->buffer = buffer; ts->code = code;
ts->lookahead = 0; ts->pos = 0;
ts->has_lookahead = 0; ts->last_info.text = NULL;
ts->last_info.text_length = 0;
ts->last_info.token = (Token)-1;
return ts; return ts;
} }
void tokenstream_close(TokenStream* ts) { void tokenstream_close(TokenStream* ts) {
if (ts == NULL) return; if (ts == NULL) return;
buffer_close(ts->buffer);
free(ts); free(ts);
} }
Token tokenstream_next(TokenStream* ts) { Token tokenstream_next(TokenStream* ts) {
if (ts == NULL || ts->buffer == NULL) return -1; if (ts == NULL) return -1;
char c; char c;
@@ -131,16 +122,14 @@ Token tokenstream_next(TokenStream* ts) {
/* Handle comments */ /* Handle comments */
if (c == '/') { if (c == '/') {
char next = read_char(ts); if (peek_char(ts) == '/') {
if (next == '/') {
/* Skip until end of line */ /* Skip until end of line */
while ((c = read_char(ts)) != (char)-1 && c != '\n') { while ((c = read_char(ts)) != (char)-1 && c != '\n') {
/* Skip */ /* Skip */
} }
continue; continue;
} }
/* Put back the character after / */ /* It's just a slash, which we don't handle yet */
unread_char(ts, next);
return -1; return -1;
} }
@@ -148,16 +137,24 @@ Token tokenstream_next(TokenStream* ts) {
break; break;
} }
if (c == (char)-1) return -1; /* EOF */ if (c == (char)-1) {
ts->last_info.token = (Token)-1;
ts->last_info.text = NULL;
ts->last_info.text_length = 0;
return -1; /* EOF */
}
/* Single-character tokens */ /* Single-character tokens */
ts->last_info.text = (char*)&ts->code[ts->pos - 1];
ts->last_info.text_length = 1;
switch (c) { switch (c) {
case '(': return TOKEN_PARENT_OPEN; case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
case ')': return TOKEN_PARENT_CLOSE; case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
case '[': return TOKEN_BRACKET_OPEN; case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
case ']': return TOKEN_BRACKET_CLOSE; case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
case ',': return TOKEN_COMMA; case ',': return ts->last_info.token = TOKEN_COMMA;
case ';': return TOKEN_SEMICOLON; case ';': return ts->last_info.token = TOKEN_SEMICOLON;
} }
/* Keywords and identifiers */ /* Keywords and identifiers */
@@ -166,5 +163,13 @@ Token tokenstream_next(TokenStream* ts) {
} }
/* Unknown character */ /* Unknown character */
ts->last_info.token = (Token)-1;
ts->last_info.text = NULL;
ts->last_info.text_length = 0;
return -1; return -1;
} }
void tokenstream_info(TokenStream* ts, TokenInfo* info) {
if (ts == NULL || info == NULL) return;
*info = ts->last_info;
}
+14 -6
View File
@@ -4,7 +4,7 @@
#ifndef TOKEN_H #ifndef TOKEN_H
#define TOKEN_H #define TOKEN_H
#include "buffer.h" #include <stddef.h>
/** /**
* A list of all possible tokens. * A list of all possible tokens.
@@ -34,6 +34,7 @@ typedef enum {
*/ */
typedef struct { typedef struct {
/// @brief The textual representation of a token. /// @brief The textual representation of a token.
/// Note that this is not necessarily null-terminated.
char* text; char* text;
/// @brief The length of the `text` string. /// @brief The length of the `text` string.
@@ -46,14 +47,12 @@ typedef struct {
typedef struct TokenStream TokenStream; typedef struct TokenStream TokenStream;
/** /**
* Returns a TokenStream for a given buffer. * Returns a TokenStream for a text.
* *
* When the tokenstream is closed, the underlying buffer is also closed. * @param code The text to read.
*
* @param buffer The buffer to read from.
* @returns A handle to the TokenStream. * @returns A handle to the TokenStream.
*/ */
TokenStream* tokenstream_open(Buffer* buffer); TokenStream* tokenstream_open(const char* code);
/** /**
* Closes a TokenStream. * Closes a TokenStream.
@@ -68,4 +67,13 @@ void tokenstream_close(TokenStream* ts);
*/ */
Token tokenstream_next(TokenStream* ts); Token tokenstream_next(TokenStream* ts);
/**
* Gets additional information about the last token that was returned
* by `tokenstream_next`.
*
* @param ts The TokenStream to use.
* @param info The TokenInfo object to store the results in.
*/
void tokenstream_info(TokenStream* ts, TokenInfo* info);
#endif #endif