Implement tokenstream_info and refactor TokenStream interface

2026-04-24 14:28:57 +02:00
parent 1406cedd82
commit b6aaa0c08f
4 changed files with 106 additions and 75 deletions
@@ -1,9 +1,14 @@
 #include "token.h"
-#include "buffer.h"
 #include <stdlib.h>
 #include <ctype.h>
 #include <string.h>

+struct TokenStream {
+	const char* code;    /* Source text being scanned; a '\0' byte ends the input. */
+	size_t pos;          /* Index in code of the next character to be read. */
+	TokenInfo last_info; /* Data for the most recent token, as reported by tokenstream_info. */
+};
+
 /**
 * Easy-to-read and modify keyword-to-token mapping.
 * Add new keywords here.
@@ -23,22 +28,17 @@ static const KeywordMap keywords[] = {
 * Look up a keyword in the keyword map.
 * Returns TOKEN_IDENTIFIER if not found.
 */
-static Token lookup_keyword(const char* str) {
+static Token lookup_keyword(const char* str, size_t length) { /* str need not be NUL-terminated: at most its first length bytes are examined */
 	int count = sizeof(keywords) / sizeof(keywords[0]);
 	for (int i = 0; i < count; i++) {
-		if (strcmp(keywords[i].keyword, str) == 0) {
+		if (strlen(keywords[i].keyword) == length && /* lengths must match, otherwise text that is a prefix of a longer keyword would compare equal */
+		    strncmp(keywords[i].keyword, str, length) == 0) {
 			return keywords[i].token;
 		}
 	}
 	return TOKEN_IDENTIFIER;
 }

-struct TokenStream {
-	Buffer* buffer;
-	char lookahead;
-	int has_lookahead;
-};
-
 /**
 * Check if a character is the start of an identifier.
 */
@@ -54,72 +54,63 @@ static int is_identifier_part(char c) {
 }

 /**
- * Read a character, using lookahead if available.
+ * Read the next character from the stream and advance past it.
+ * At the terminating '\0' nothing is consumed and (char)-1 is returned
+ * as the end-of-input marker.
 */
 static char read_char(TokenStream* ts) {
-	if (ts->has_lookahead) {
-		ts->has_lookahead = 0;
-		return ts->lookahead;
-	}
-	return buffer_read(ts->buffer);
+	char c = ts->code[ts->pos];
+	if (c == '\0') return (char)-1; /* end of input: pos stays on the '\0' */
+	ts->pos++;
+	return c;
 }

 /**
- * Put a character back into the lookahead buffer.
+ * Return the next character in the stream without consuming it.
+ * Yields (char)-1 when the stream is at its terminating '\0'.
 */
-static void unread_char(TokenStream* ts, char c) {
-	if (c != (char)-1) {
-		ts->lookahead = c;
-		ts->has_lookahead = 1;
-	}
+static char peek_char(TokenStream* ts) {
+	char c = ts->code[ts->pos];
+	if (c == '\0') return (char)-1; /* same end-of-input marker as read_char */
+	return c;
 }

-/**
- * Try to read a keyword or identifier starting with the given character.
- * Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword.
- */
+/**
+ * Scan the rest of a keyword or identifier whose first character has
+ * already been consumed from the stream, and record it in ts->last_info.
+ *
+ * The token text starts at code[pos - 1], so the caller must have read
+ * `first` immediately before calling. `first` itself is no longer needed
+ * because the text is taken directly from the source string.
+ *
+ * Returns the matching keyword token, or TOKEN_IDENTIFIER if the text is
+ * not a keyword.
+ */
 static Token read_keyword_or_identifier(TokenStream* ts, char first) {
-	char buffer[256];
-	int index = 0;
-	buffer[index++] = first;
+	(void)first; /* kept for the existing signature; silences unused-parameter warnings */
+	const char* start = &ts->code[ts->pos - 1];
+	size_t length = 1;

-	char c;
-	while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) {
-		if (index < 255) {
-			buffer[index++] = c;
-		}
+	/* Extend the token while the next character belongs to an identifier;
+	 * the character that ends it is left unread. */
+	while (is_identifier_part(peek_char(ts))) {
+		read_char(ts);
+		length++;
 	}

-	/* Put back the character that ended the identifier */
-	unread_char(ts, c);
-	buffer[index] = '\0';
-
-	/* Check for keywords */
-	return lookup_keyword(buffer);
+	Token token = lookup_keyword(start, length);
+	/* text aliases the source string (no copy), so it is not
+	 * NUL-terminated at text_length. */
+	ts->last_info.token = token;
+	ts->last_info.text = (char*)start;
+	ts->last_info.text_length = length;
+	return token;
 }

-TokenStream* tokenstream_open(Buffer* buffer) {
-	if (buffer == NULL) return NULL;
+TokenStream* tokenstream_open(const char* code) { /* returns a new stream over code, or NULL if code is NULL or allocation fails */
+	if (code == NULL) return NULL;

 	TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream));
 	if (ts == NULL) {
 		return NULL;
 	}

-	ts->buffer = buffer;
-	ts->lookahead = 0;
-	ts->has_lookahead = 0;
+	ts->code = code; /* pointer is stored as-is; the string is not copied */
+	ts->pos = 0;
+	ts->last_info.text = NULL; /* no token has been scanned yet */
+	ts->last_info.text_length = 0;
+	ts->last_info.token = (Token)-1; /* same value tokenstream_next records at end of input */
 	return ts;
 }

 void tokenstream_close(TokenStream* ts) {
 	if (ts == NULL) return;
-	buffer_close(ts->buffer);
 	free(ts); /* frees only the stream; ts->code is not released here */
 }

 Token tokenstream_next(TokenStream* ts) {
-	if (ts == NULL || ts->buffer == NULL) return -1;
+	if (ts == NULL) return -1;

 	char c;

@@ -131,16 +122,14 @@ Token tokenstream_next(TokenStream* ts) {

 		/* Handle comments */
 		if (c == '/') {
-			char next = read_char(ts);
-			if (next == '/') {
+			if (peek_char(ts) == '/') {
 				/* Skip until end of line */
 				while ((c = read_char(ts)) != (char)-1 && c != '\n') {
 					/* Skip */
 				}
 				continue;
 			}
-			/* Put back the character after / */
-			unread_char(ts, next);
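+			/* A lone '/' is not a supported token yet. NOTE(review): this path leaves last_info at the previous token's values, unlike the EOF and unknown-character paths. */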
 			return -1;
 		}

@@ -148,16 +137,24 @@ Token tokenstream_next(TokenStream* ts) {
 		break;
 	}

-	if (c == (char)-1) return -1; /* EOF */
+	if (c == (char)-1) {
+		ts->last_info.token = (Token)-1;
+		ts->last_info.text = NULL;
+		ts->last_info.text_length = 0;
+		return -1; /* EOF */
+	}

 	/* Single-character tokens */
+	ts->last_info.text = (char*)&ts->code[ts->pos - 1];
+	ts->last_info.text_length = 1;
+
 	switch (c) {
-	case '(': return TOKEN_PARENT_OPEN;
-	case ')': return TOKEN_PARENT_CLOSE;
-	case '[': return TOKEN_BRACKET_OPEN;
-	case ']': return TOKEN_BRACKET_CLOSE;
-	case ',': return TOKEN_COMMA;
-	case ';': return TOKEN_SEMICOLON;
+	case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
+	case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
+	case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
+	case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
+	case ',': return ts->last_info.token = TOKEN_COMMA;
+	case ';': return ts->last_info.token = TOKEN_SEMICOLON;
 	}

 	/* Keywords and identifiers */
@@ -166,5 +163,13 @@ Token tokenstream_next(TokenStream* ts) {
 	}

 	/* Unknown character */
+	ts->last_info.token = (Token)-1;
+	ts->last_info.text = NULL;
+	ts->last_info.text_length = 0;
 	return -1;
 }
+
+/**
+ * Copy the details of the most recently scanned token into *info.
+ * Does nothing when ts or info is NULL; *info is left untouched then.
+ */
+void tokenstream_info(TokenStream* ts, TokenInfo* info) {
+	if (ts != NULL && info != NULL) {
+		*info = ts->last_info;
+	}
+}