Implement tokenstream_info and refactor TokenStream interface

2026-04-24 14:28:57 +02:00
parent 1406cedd82
commit b6aaa0c08f
4 changed files with 106 additions and 75 deletions
@@ -35,6 +35,7 @@ static TestCase s_tests[] = {
    {"tokenstream_comma", test_tokenstream_comma},
    {"tokenstream_whitespace_ignored", test_tokenstream_whitespace_ignored},
    {"tokenstream_void_function_signature", test_tokenstream_void_function_signature},
+    {"tokenstream_info", test_tokenstream_info},
 };


@@ -1,5 +1,6 @@
 #include "test.h"
 #include "token.h"
+#include <string.h>

 static void test_tokenstream_open_fail(void) {
 	TokenStream* ts = tokenstream_open(NULL);
@@ -7,8 +8,7 @@ static void test_tokenstream_open_fail(void) {
 }

 static void test_tokenstream_simple_keyword(void) {
-	Buffer* buf = buffer_open_string("module");
-	TokenStream* ts = tokenstream_open(buf);
+	TokenStream* ts = tokenstream_open("module");

 	Token t = tokenstream_next(ts);
 	if (t != TOKEN_MODULE) fail("expected TOKEN_MODULE");
@@ -20,8 +20,7 @@ static void test_tokenstream_simple_keyword(void) {
 }

 static void test_tokenstream_keywords_and_symbols(void) {
-	Buffer* buf = buffer_open_string("module main; import stdio;");
-	TokenStream* ts = tokenstream_open(buf);
+	TokenStream* ts = tokenstream_open("module main; import stdio;");

 	if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
 	if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER (main)");
@@ -35,8 +34,7 @@ static void test_tokenstream_keywords_and_symbols(void) {
 }

 static void test_tokenstream_parentheses_and_brackets(void) {
-	Buffer* buf = buffer_open_string("()[]");
-	TokenStream* ts = tokenstream_open(buf);
+	TokenStream* ts = tokenstream_open("()[]");

 	if (tokenstream_next(ts) != TOKEN_PARENT_OPEN) fail("expected TOKEN_PARENT_OPEN");
 	if (tokenstream_next(ts) != TOKEN_PARENT_CLOSE) fail("expected TOKEN_PARENT_CLOSE");
@@ -48,8 +46,7 @@ static void test_tokenstream_parentheses_and_brackets(void) {
 }

 static void test_tokenstream_comma(void) {
-	Buffer* buf = buffer_open_string("a,b,c");
-	TokenStream* ts = tokenstream_open(buf);
+	TokenStream* ts = tokenstream_open("a,b,c");

 	if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected a");
 	if (tokenstream_next(ts) != TOKEN_COMMA) fail("expected comma");
@@ -62,8 +59,7 @@ static void test_tokenstream_comma(void) {
 }

 static void test_tokenstream_whitespace_ignored(void) {
-	Buffer* buf = buffer_open_string("  module  \n\t import  ;  ");
-	TokenStream* ts = tokenstream_open(buf);
+	TokenStream* ts = tokenstream_open("  module  \n\t import  ;  ");

 	if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
 	if (tokenstream_next(ts) != TOKEN_IMPORT) fail("expected TOKEN_IMPORT");
@@ -74,8 +70,7 @@ static void test_tokenstream_whitespace_ignored(void) {
 }

 static void test_tokenstream_void_function_signature(void) {
-	Buffer* buf = buffer_open_string("void main()");
-	TokenStream* ts = tokenstream_open(buf);
+	TokenStream* ts = tokenstream_open("void main()");

 	if (tokenstream_next(ts) != TOKEN_VOID) fail("expected TOKEN_VOID");
 	if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
@@ -85,3 +80,25 @@ static void test_tokenstream_void_function_signature(void) {

 	tokenstream_close(ts);
 }
+
+/**
+ * Checks that tokenstream_info reports the token kind, text and length of
+ * the token most recently returned by tokenstream_next.
+ */
+static void test_tokenstream_info(void) {
+	TokenStream* ts = tokenstream_open("module main;");
+	if (ts == NULL) {
+		/* Nothing below is meaningful without a stream. */
+		fail("tokenstream_open returned NULL");
+		return;
+	}
+
+	Token t1 = tokenstream_next(ts);
+	TokenInfo info1;
+	tokenstream_info(ts, &info1);
+	if (t1 != TOKEN_MODULE) fail("expected TOKEN_MODULE");
+	if (info1.token != TOKEN_MODULE) fail("info: expected TOKEN_MODULE");
+	if (info1.text_length != 6) fail("info: expected length 6");
+	/* text is NULL when no token was recognised; never pass that to strncmp */
+	if (info1.text == NULL || strncmp(info1.text, "module", 6) != 0) fail("info: expected 'module'");
+
+	Token t2 = tokenstream_next(ts);
+	TokenInfo info2;
+	tokenstream_info(ts, &info2);
+	if (t2 != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
+	if (info2.token != TOKEN_IDENTIFIER) fail("info: expected TOKEN_IDENTIFIER");
+	if (info2.text_length != 4) fail("info: expected length 4");
+	if (info2.text == NULL || strncmp(info2.text, "main", 4) != 0) fail("info: expected 'main'");
+
+	tokenstream_close(ts);
+}
@@ -1,9 +1,14 @@
 #include "token.h"
-#include "buffer.h"
 #include <stdlib.h>
 #include <ctype.h>
 #include <string.h>

+struct TokenStream { /* Lexer state for one source text. */
+	const char* code;    /* Text being tokenized; the pointer is stored as passed to tokenstream_open, not copied. */
+	size_t pos;          /* Index into `code` of the next character to read. */
+	TokenInfo last_info; /* Details of the most recent token; copied out by tokenstream_info. */
+};
+
 /**
 * Easy-to-read and modify keyword-to-token mapping.
 * Add new keywords here.
@@ -23,22 +28,17 @@ static const KeywordMap keywords[] = {
 * Look up a keyword in the keyword map.
 * Returns TOKEN_IDENTIFIER if not found.
 */
-static Token lookup_keyword(const char* str) {
+static Token lookup_keyword(const char* str, size_t length) { /* `str` need not be null-terminated: only its first `length` characters are examined */
 	int count = sizeof(keywords) / sizeof(keywords[0]);
 	for (int i = 0; i < count; i++) {
-		if (strcmp(keywords[i].keyword, str) == 0) {
+		if (strlen(keywords[i].keyword) == length && /* lengths must match: strncmp alone would accept text that is only a prefix of a keyword */
+		    strncmp(keywords[i].keyword, str, length) == 0) {
 			return keywords[i].token;
 		}
 	}
 	return TOKEN_IDENTIFIER;
 }

-struct TokenStream {
-	Buffer* buffer;
-	char lookahead;
-	int has_lookahead;
-};
-
 /**
 * Check if a character is the start of an identifier.
 */
@@ -54,72 +54,63 @@ static int is_identifier_part(char c) {
 }

 /**
- * Read a character, using lookahead if available.
+ * Read a character from the stream.
+ *
+ * Returns the character at the current position and advances past it.
+ * At the terminating '\0' it returns (char)-1 and does not advance, so
+ * repeated calls at the end of the text keep returning (char)-1.
 */
 static char read_char(TokenStream* ts) {
-	if (ts->has_lookahead) {
-		ts->has_lookahead = 0;
-		return ts->lookahead;
-	}
-	return buffer_read(ts->buffer);
+	char c = ts->code[ts->pos];
+	if (c == '\0') return (char)-1;
+	ts->pos++;
+	return c;
 }

 /**
- * Put a character back into the lookahead buffer.
+ * Peek at the next character in the stream.
+ *
+ * Returns the character that read_char would return next, without
+ * changing the position; (char)-1 at the end of the text.
 */
-static void unread_char(TokenStream* ts, char c) {
-	if (c != (char)-1) {
-		ts->lookahead = c;
-		ts->has_lookahead = 1;
-	}
+static char peek_char(TokenStream* ts) {
+	char c = ts->code[ts->pos];
+	if (c == '\0') return (char)-1;
+	return c;
 }

-/**
- * Try to read a keyword or identifier starting with the given character.
- * Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword.
- */
 static Token read_keyword_or_identifier(TokenStream* ts, char first) { /* Scans the rest of a keyword/identifier, records it in ts->last_info and returns its token. */
-	char buffer[256];
-	int index = 0;
-	buffer[index++] = first;
+	const char* start = &ts->code[ts->pos - 1]; /* `first` is not read here; assumes the caller has just consumed it, so it sits at pos - 1 */
+	size_t length = 1;

-	char c;
-	while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) {
-		if (index < 255) {
-			buffer[index++] = c;
-		}
+	while (is_identifier_part(peek_char(ts))) { /* peek first so the character that ends the identifier stays unread */
+		read_char(ts);
+		length++;
 	}

-	/* Put back the character that ended the identifier */
-	unread_char(ts, c);
-	buffer[index] = '\0';
-
-	/* Check for keywords */
-	return lookup_keyword(buffer);
+	Token token = lookup_keyword(start, length);
+	ts->last_info.token = token;
+	ts->last_info.text = (char*)start; /* points into ts->code (const is cast away); not null-terminated, use text_length */
+	ts->last_info.text_length = length;
+	return token;
 }

-TokenStream* tokenstream_open(Buffer* buffer) {
-	if (buffer == NULL) return NULL;
+TokenStream* tokenstream_open(const char* code) {
+	if (code == NULL) return NULL;

 	TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream));
 	if (ts == NULL) {
 		return NULL;
 	}

-	ts->buffer = buffer;
-	ts->lookahead = 0;
-	ts->has_lookahead = 0;
+	ts->code = code; /* the pointer is stored, not copied: the text must stay valid while the stream is in use */
+	ts->pos = 0;
+	ts->last_info.text = NULL;
+	ts->last_info.text_length = 0;
+	ts->last_info.token = (Token)-1; /* no token read yet: same values tokenstream_next stores at end of input */
 	return ts;
 }

 void tokenstream_close(TokenStream* ts) {
 	if (ts == NULL) return; /* closing a NULL stream is a no-op */
-	buffer_close(ts->buffer);
 	free(ts); /* releases only the stream object; the text given to tokenstream_open is not freed here */
 }

 Token tokenstream_next(TokenStream* ts) {
-	if (ts == NULL || ts->buffer == NULL) return -1;
+	if (ts == NULL) return -1;

 	char c;

@@ -131,16 +122,14 @@ Token tokenstream_next(TokenStream* ts) {

 		/* Handle comments */
 		if (c == '/') {
-			char next = read_char(ts);
-			if (next == '/') {
+			if (peek_char(ts) == '/') {
 				/* Skip until end of line */
 				while ((c = read_char(ts)) != (char)-1 && c != '\n') {
 					/* Skip */
 				}
 				continue;
 			}
-			/* Put back the character after / */
-			unread_char(ts, next);
+			/* It's just a slash, which we don't handle yet */
 			return -1;
 		}

@@ -148,16 +137,24 @@ Token tokenstream_next(TokenStream* ts) {
 		break;
 	}

-	if (c == (char)-1) return -1; /* EOF */
+	if (c == (char)-1) {
+		ts->last_info.token = (Token)-1;
+		ts->last_info.text = NULL;
+		ts->last_info.text_length = 0;
+		return -1; /* EOF */
+	}

 	/* Single-character tokens */
+	ts->last_info.text = (char*)&ts->code[ts->pos - 1];
+	ts->last_info.text_length = 1;
+
 	switch (c) {
-	case '(': return TOKEN_PARENT_OPEN;
-	case ')': return TOKEN_PARENT_CLOSE;
-	case '[': return TOKEN_BRACKET_OPEN;
-	case ']': return TOKEN_BRACKET_CLOSE;
-	case ',': return TOKEN_COMMA;
-	case ';': return TOKEN_SEMICOLON;
+	case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
+	case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
+	case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
+	case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
+	case ',': return ts->last_info.token = TOKEN_COMMA;
+	case ';': return ts->last_info.token = TOKEN_SEMICOLON;
 	}

 	/* Keywords and identifiers */
@@ -166,5 +163,13 @@ Token tokenstream_next(TokenStream* ts) {
 	}

 	/* Unknown character */
+	ts->last_info.token = (Token)-1;
+	ts->last_info.text = NULL;
+	ts->last_info.text_length = 0;
 	return -1;
 }
+
+/**
+ * Copy the details of the most recent token into `*info`.
+ *
+ * `info->text` points into the text given to tokenstream_open and is not
+ * null-terminated; use `info->text_length`.
+ *
+ * When `ts` is NULL, `*info` is set to the "no token" state (token -1,
+ * NULL text, length 0) rather than left untouched, so the caller never
+ * reads an indeterminate struct. A NULL `info` is ignored.
+ */
+void tokenstream_info(TokenStream* ts, TokenInfo* info) {
+	if (info == NULL) return;
+	if (ts == NULL) {
+		/* Members not named in the compound literal are zero-initialized. */
+		*info = (TokenInfo){ .token = (Token)-1 };
+		return;
+	}
+	*info = ts->last_info;
+}
@@ -4,7 +4,7 @@
 #ifndef TOKEN_H
 #define TOKEN_H

-#include "buffer.h"
+#include <stddef.h>

 /**
 * A list of all possible tokens.
@@ -34,6 +34,7 @@ typedef enum {
 */
 typedef struct {
 	/// @brief The textual representation of a token.
+	/// Note that this is not necessarily null-terminated.
 	char* text;

 	/// @brief The length of the `text` string.
@@ -46,14 +47,12 @@ typedef struct {
 typedef struct TokenStream TokenStream;

 /**
- * Returns a TokenStream for a given buffer.
+ * Returns a TokenStream that tokenizes a null-terminated text.
+ * 
+ * The text is not copied: the stream keeps the `code` pointer, and the
+ * `text` pointers reported through TokenInfo refer into it. The text must
+ * therefore stay valid and unchanged until the stream is closed; closing
+ * the stream does not free it.
 * 
- * When the tokenstream is closed, the underlying buffer is also closed.
- * 
- * @param buffer The buffer to read from.
+ * @param code The text to read.
 * @returns A handle to the TokenStream.
+ *          NULL if `code` is NULL or the stream could not be allocated.
 */
-TokenStream* tokenstream_open(Buffer* buffer);
+TokenStream* tokenstream_open(const char* code);

 /**
 * Closes a TokenStream.
@@ -68,4 +67,13 @@ void tokenstream_close(TokenStream* ts);
 */
 Token tokenstream_next(TokenStream* ts);

+/**
+ * Gets additional information about the last token that was returned
+ * by `tokenstream_next`.
+ * 
+ * `info->text` points into the text given to `tokenstream_open` and is
+ * not null-terminated; use `info->text_length` to delimit it.
+ * After end of input or an unknown character, `token` is -1, `text` is
+ * NULL and `text_length` is 0. The same values are reported before the
+ * first call to `tokenstream_next`.
+ * If `info` is NULL the call has no effect.
+ * 
+ * @param ts The TokenStream to use.
+ * @param info The TokenInfo object to store the results in.
+ */
+void tokenstream_info(TokenStream* ts, TokenInfo* info);
+
 #endif