Token refactor and better logs

2026-04-24 20:28:08 +02:00
parent da3425ec10
commit 451a9a2a22
9 changed files with 210 additions and 122 deletions
@@ -6,7 +6,9 @@
 struct TokenStream {
 	const char* code;
 	size_t pos;
-	TokenInfo last_info;
+	int line;
+	int column;
+	const char* line_start;
 };

 /**
@@ -15,7 +17,7 @@ struct TokenStream {
 */
 typedef struct {
 	const char* keyword;
-	Token token;
+	TokenType token;
 } KeywordMap;

 static const KeywordMap keywords[] = {
@@ -28,7 +30,7 @@ static const KeywordMap keywords[] = {
 * Look up a keyword in the keyword map.
 * Returns TOKEN_IDENTIFIER if not found.
 */
-static Token lookup_keyword(const char* str, size_t length) {
+static TokenType lookup_keyword(const char* str, size_t length) {
 	int count = sizeof(keywords) / sizeof(keywords[0]);
 	for (int i = 0; i < count; i++) {
 		if (strlen(keywords[i].keyword) == length &&
@@ -53,39 +55,49 @@ static int is_identifier_part(char c) {
 	return isalnum(c) || c == '_';
 }

-/**
- * Read a character from the stream.
- */
-static char read_char(TokenStream* ts) {
-	char c = ts->code[ts->pos];
-	if (c == '\0') return (char)-1;
-	ts->pos++;
-	return c;
-}
-
 /**
 * Peek at the next character in the stream.
 */
 static char peek_char(TokenStream* ts) {
+	return ts->code[ts->pos];
+}
+
+/**
+ * Read a character from the stream and update position.
+ */
+static char read_char(TokenStream* ts) {
 	char c = ts->code[ts->pos];
-	if (c == '\0') return (char)-1;
+	if (c == '\0') return '\0';
+
+	ts->pos++;
+	if (c == '\n') {
+		ts->line++;
+		ts->column = 1;
+		ts->line_start = &ts->code[ts->pos];
+	} else {
+		ts->column++;
+	}
 	return c;
 }

-static Token read_keyword_or_identifier(TokenStream* ts, char first) {
-	const char* start = &ts->code[ts->pos - 1];
-	size_t length = 1;
-
-	while (is_identifier_part(peek_char(ts))) {
-		read_char(ts);
-		length++;
+static size_t get_line_length(const char* line_start) {
+	const char* p = line_start;
+	while (*p != '\n' && *p != '\0') {
+		p++;
 	}
+	return (size_t)(p - line_start);
+}

-	Token token = lookup_keyword(start, length);
-	ts->last_info.token = token;
-	ts->last_info.text = (char*)start;
-	ts->last_info.text_length = length;
-	return token;
+static Token create_token(TokenStream* ts, TokenType type, const char* text, size_t length, int line, int column, const char* line_start) {
+	Token t;
+	t.token = type;
+	t.text = (char*)text;
+	t.text_length = length;
+	t.line = line;
+	t.column = column;
+	t.line_text = (char*)line_start;
+	t.line_text_length = get_line_length(line_start);
+	return t;
 }

 TokenStream* tokenstream_open(const char* code) {
@@ -98,9 +110,9 @@ TokenStream* tokenstream_open(const char* code) {

 	ts->code = code;
 	ts->pos = 0;
-	ts->last_info.text = NULL;
-	ts->last_info.text_length = 0;
-	ts->last_info.token = (Token)-1;
+	ts->line = 1;
+	ts->column = 1;
+	ts->line_start = code;
 	return ts;
 }

@@ -110,66 +122,70 @@ void tokenstream_close(TokenStream* ts) {
 }

 Token tokenstream_next(TokenStream* ts) {
-	if (ts == NULL) return -1;
+	if (ts == NULL) {
+		Token t = {0};
+		t.token = TOKEN_EOF;
+		return t;
+	}

 	char c;

 	/* Skip whitespace and comments */
-	while ((c = read_char(ts)) != (char)-1) {
+	while ((c = peek_char(ts)) != '\0') {
 		if (isspace(c)) {
+			read_char(ts);
 			continue;
 		}

 		/* Handle comments */
 		if (c == '/') {
-			if (peek_char(ts) == '/') {
+			if (ts->code[ts->pos + 1] == '/') {
 				/* Skip until end of line */
-				while ((c = read_char(ts)) != (char)-1 && c != '\n') {
+				while ((c = read_char(ts)) != '\0' && c != '\n') {
 					/* Skip */
 				}
 				continue;
 			}
 			/* It's just a slash, which we don't handle yet */
-			return -1;
+			break;
 		}

 		/* We found a non-whitespace, non-comment character */
 		break;
 	}

-	if (c == (char)-1) {
-		ts->last_info.token = (Token)-1;
-		ts->last_info.text = NULL;
-		ts->last_info.text_length = 0;
-		return -1; /* EOF */
+	if (peek_char(ts) == '\0') {
+		return create_token(ts, TOKEN_EOF, NULL, 0, ts->line, ts->column, ts->line_start);
 	}

-	/* Single-character tokens */
-	ts->last_info.text = (char*)&ts->code[ts->pos - 1];
-	ts->last_info.text_length = 1;
+	int start_line = ts->line;
+	int start_column = ts->column;
+	const char* line_start = ts->line_start;
+	const char* start_text = &ts->code[ts->pos];

+	c = read_char(ts);
+
+	/* Single-character tokens */
 	switch (c) {
-	case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
-	case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
-	case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
-	case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
-	case ',': return ts->last_info.token = TOKEN_COMMA;
-	case ';': return ts->last_info.token = TOKEN_SEMICOLON;
+	case '(': return create_token(ts, TOKEN_PARENT_OPEN, start_text, 1, start_line, start_column, line_start);
+	case ')': return create_token(ts, TOKEN_PARENT_CLOSE, start_text, 1, start_line, start_column, line_start);
+	case '[': return create_token(ts, TOKEN_BRACKET_OPEN, start_text, 1, start_line, start_column, line_start);
+	case ']': return create_token(ts, TOKEN_BRACKET_CLOSE, start_text, 1, start_line, start_column, line_start);
+	case ',': return create_token(ts, TOKEN_COMMA, start_text, 1, start_line, start_column, line_start);
+	case ';': return create_token(ts, TOKEN_SEMICOLON, start_text, 1, start_line, start_column, line_start);
 	}

 	/* Keywords and identifiers */
 	if (is_identifier_start(c)) {
-		return read_keyword_or_identifier(ts, c);
+		size_t length = 1;
+		while (is_identifier_part(peek_char(ts))) {
+			read_char(ts);
+			length++;
+		}
+		TokenType type = lookup_keyword(start_text, length);
+		return create_token(ts, type, start_text, length, start_line, start_column, line_start);
 	}

 	/* Unknown character */
-	ts->last_info.token = (Token)-1;
-	ts->last_info.text = NULL;
-	ts->last_info.text_length = 0;
-	return -1;
-}
-
-void tokenstream_info(TokenStream* ts, TokenInfo* info) {
-	if (ts == NULL || info == NULL) return;
-	*info = ts->last_info;
+	return create_token(ts, TOKEN_UNKNOWN, start_text, 1, start_line, start_column, line_start);
 }