Files
c2/v0/token.c
T
seeseemelk dccdcb8ba5 Implement token.c with comprehensive tests and easy-to-modify token mapping
- Created token-to-string mapping array parallel to Token enum in token.c
- Implemented TokenStream with lookahead buffering for proper tokenization
- Implemented tokenstream_open/close/next functions with support for:
  - Keywords (module, import, void)
  - Symbols (parentheses, brackets, comma, semicolon)
  - Identifiers (alphanumeric starting with letter or underscore)
  - Comment skipping (// style)
  - Whitespace handling
- Added token_to_string function to token.h for token inspection
- Created comprehensive test suite (15 tests) covering all token types and edge cases
- All tests pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-24 09:35:18 +02:00

180 lines
3.7 KiB
C

#include "token.h"
#include "buffer.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
/**
 * Printable name for each token, indexed by Token value.
 *
 * Entry i is the name reported for the Token whose numeric value is i, so
 * the order here must stay in lockstep with the Token enum in token.h.
 * To add a token, append to the enum and add its name at the same position.
 */
static const char *const token_names[] = {
    /* keywords */
    "module",
    "import",
    /* punctuation */
    "semicolon",
    "paren_open",
    "paren_close",
    "bracket_open",
    "bracket_close",
    "comma",
    /* keyword */
    "void",
    /* everything else that looks like a name */
    "identifier",
};
/**
* State of one open token stream: the character source plus a
* single-character pushback slot used by read_char / unread_char.
*/
struct TokenStream {
/* Character source; assigned in tokenstream_open, released in tokenstream_close. */
Buffer* buffer;
/* Pushed-back character; only meaningful while has_lookahead is non-zero. */
char lookahead;
/* Non-zero when lookahead holds a character that read_char must return next. */
int has_lookahead;
};
/**
 * Look up the printable name of a token.
 *
 * @param token The token to describe.
 * @returns The matching entry of token_names, or the literal "unknown"
 *          when the value lies outside the table (negative or too large).
 */
const char* token_to_string(Token token) {
    int name_count = sizeof(token_names) / sizeof(token_names[0]);

    /* Reject anything that cannot index the table. */
    if (token < 0 || token >= name_count) {
        return "unknown";
    }

    return token_names[token];
}
/**
 * Check if a character may begin an identifier.
 *
 * @param c The character to classify.
 * @returns Non-zero if c is alphabetic (per isalpha) or '_', zero otherwise.
 */
static int is_identifier_start(char c) {
    /* The <ctype.h> classifiers are only defined for values representable
     * as unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80
     * would arrive negative; convert before classifying. */
    return isalpha((unsigned char)c) || c == '_';
}
/**
 * Check if a character may appear inside an identifier (after the first
 * character).
 *
 * @param c The character to classify.
 * @returns Non-zero if c is alphanumeric (per isalnum) or '_', zero otherwise.
 */
static int is_identifier_part(char c) {
    /* The <ctype.h> classifiers are only defined for values representable
     * as unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80
     * would arrive negative; convert before classifying. */
    return isalnum((unsigned char)c) || c == '_';
}
/**
 * Fetch the next character of the stream.
 *
 * A character previously pushed back with unread_char takes priority and
 * is consumed by this call; otherwise the character comes from buffer_read.
 *
 * @param ts The stream to read from.
 * @returns The next character. Callers in this file treat (char)-1 as
 *          end of input.
 */
static char read_char(TokenStream* ts) {
    if (!ts->has_lookahead) {
        return buffer_read(ts->buffer);
    }

    /* Consume the pushback slot. */
    ts->has_lookahead = 0;
    return ts->lookahead;
}
/**
 * Push one character back so the next read_char returns it.
 *
 * Only a single slot exists: a second push before a read overwrites the
 * first. The end-of-input marker (char)-1 is never stored.
 *
 * @param ts The stream to push back into.
 * @param c  The character to return on the next read.
 */
static void unread_char(TokenStream* ts, char c) {
    if (c == (char)-1) {
        return;
    }

    ts->has_lookahead = 1;
    ts->lookahead = c;
}
/**
 * Consume the remainder of a word whose first character has already been
 * read, and classify it.
 *
 * Characters are consumed while is_identifier_part accepts them. The
 * character that ends the word is pushed back (unless it is end of input).
 * Only the first 255 characters are kept for the keyword comparison; any
 * further characters are still consumed but dropped.
 *
 * @param ts    The stream to read from.
 * @param first The already-consumed first character of the word.
 * @returns TOKEN_MODULE, TOKEN_IMPORT or TOKEN_VOID for the matching
 *          keyword, TOKEN_IDENTIFIER for any other word.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
    char word[256];
    const int capacity = (int)sizeof(word) - 1; /* leave room for '\0' */
    int length = 0;
    char next;

    word[length++] = first;

    for (;;) {
        next = read_char(ts);
        if (next == (char)-1 || !is_identifier_part(next)) {
            break;
        }
        if (length < capacity) {
            word[length++] = next;
        }
    }

    /* The terminating character belongs to the following token. */
    unread_char(ts, next);
    word[length] = '\0';

    /* Keywords win over plain identifiers. */
    Token kind = TOKEN_IDENTIFIER;
    if (strcmp(word, "module") == 0) {
        kind = TOKEN_MODULE;
    } else if (strcmp(word, "import") == 0) {
        kind = TOKEN_IMPORT;
    } else if (strcmp(word, "void") == 0) {
        kind = TOKEN_VOID;
    }
    return kind;
}
/**
 * Open a token stream over the file at the given path.
 *
 * @param path Path handed to buffer_open_file; may be NULL.
 * @returns A new stream to be released with tokenstream_close, or NULL if
 *          path is NULL, the buffer could not be opened, or allocation
 *          failed (the buffer is closed again in that last case).
 */
TokenStream* tokenstream_open(const char* path) {
    if (path == NULL) {
        return NULL;
    }

    Buffer* source = buffer_open_file(path);
    if (source == NULL) {
        return NULL;
    }

    TokenStream* stream = malloc(sizeof *stream);
    if (stream != NULL) {
        stream->has_lookahead = 0;
        stream->lookahead = 0;
        stream->buffer = source;
        return stream;
    }

    /* Allocation failed: do not leak the buffer we just opened. */
    buffer_close(source);
    return NULL;
}
/**
 * Close a token stream: closes its buffer and frees the stream itself.
 *
 * @param ts The stream to close; NULL is accepted and ignored.
 */
void tokenstream_close(TokenStream* ts) {
    if (ts != NULL) {
        buffer_close(ts->buffer);
        free(ts);
    }
}
/**
 * Read the next token from the stream.
 *
 * Whitespace and "//" line comments are skipped. Single-character symbols
 * map directly to their tokens; a letter or '_' starts a keyword or
 * identifier.
 *
 * @param ts The stream to read from.
 * @returns The next token, or -1 when no token can be produced. -1 covers
 *          several cases that callers cannot tell apart: ts (or its
 *          buffer) is NULL, end of input was reached, a '/' was not
 *          followed by a second '/', or an unrecognised character was
 *          encountered.
 */
Token tokenstream_next(TokenStream* ts) {
    if (ts == NULL || ts->buffer == NULL) {
        return -1;
    }

    char c;

    /* Skip whitespace and comments */
    while ((c = read_char(ts)) != (char)-1) {
        /* <ctype.h> classifiers require an unsigned char value (or EOF);
         * plain char may be negative for bytes >= 0x80, so convert first. */
        if (isspace((unsigned char)c)) {
            continue;
        }

        /* Handle comments */
        if (c == '/') {
            char next = read_char(ts);
            if (next == '/') {
                /* Skip until end of line (or end of input) */
                while ((c = read_char(ts)) != (char)-1 && c != '\n') {
                    /* Skip */
                }
                continue;
            }
            /* A lone '/' is not a token: put back the character after it
             * and report failure. */
            unread_char(ts, next);
            return -1;
        }

        /* We found a non-whitespace, non-comment character */
        break;
    }

    if (c == (char)-1) {
        return -1; /* EOF */
    }

    /* Single-character tokens */
    switch (c) {
        case '(': return TOKEN_PARENT_OPEN;
        case ')': return TOKEN_PARENT_CLOSE;
        case '[': return TOKEN_BRACKET_OPEN;
        case ']': return TOKEN_BRACKET_CLOSE;
        case ',': return TOKEN_COMMA;
        case ';': return TOKEN_SEMICOLON;
        default:  break; /* not a symbol; try identifiers below */
    }

    /* Keywords and identifiers */
    if (is_identifier_start(c)) {
        return read_keyword_or_identifier(ts, c);
    }

    /* Unknown character */
    return -1;
}