#include "token.h"
#include "buffer.h"

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

/*
 * Value this file treats as "no more input" when it comes back from
 * buffer_read(). NOTE(review): this assumes buffer_read() signals end of
 * input with -1; confirm against buffer.h. A literal 0xFF byte in the input
 * is indistinguishable from end of input with this representation.
 */
#define INPUT_END ((char)-1)

/*
 * Result returned by tokenstream_next() when no token can be produced:
 * invalid arguments, end of input, a lone '/', or an unrecognised character.
 */
#define TS_NO_TOKEN ((Token)-1)

/* Size of the scratch buffer for identifier text, including the NUL. */
enum { IDENTIFIER_CAPACITY = 256 };

/**
 * Easy-to-read and modify keyword-to-token mapping.
 * Add new keywords here.
 */
typedef struct {
    const char* keyword;
    Token token;
} KeywordMap;

static const KeywordMap keywords[] = {
    {"module", TOKEN_MODULE},
    {"import", TOKEN_IMPORT},
    {"void", TOKEN_VOID},
};

/**
 * Look up a NUL-terminated string in the keyword map.
 *
 * @param str  Text to look up; compared with strcmp(), so the match is
 *             exact and case-sensitive.
 * @return The matching keyword token, or TOKEN_IDENTIFIER if not found.
 */
static Token lookup_keyword(const char* str)
{
    const size_t count = sizeof keywords / sizeof keywords[0];

    for (size_t i = 0; i < count; i++) {
        if (strcmp(keywords[i].keyword, str) == 0) {
            return keywords[i].token;
        }
    }
    return TOKEN_IDENTIFIER;
}

/**
 * Lexer state: the underlying character source plus one character of
 * pushback.
 */
struct TokenStream {
    Buffer* buffer;     /* character source; closed by tokenstream_close() */
    char lookahead;     /* pushed-back character, valid iff has_lookahead */
    int has_lookahead;  /* non-zero when lookahead holds a character */
};

/**
 * Check if a character is the start of an identifier (letter or '_').
 *
 * The cast to unsigned char is required: passing a negative char value to a
 * <ctype.h> classification function is undefined behaviour.
 */
static int is_identifier_start(char c)
{
    return isalpha((unsigned char)c) || c == '_';
}

/**
 * Check if a character can be part of an identifier (letter, digit or '_').
 *
 * See is_identifier_start() for why the argument is cast to unsigned char.
 */
static int is_identifier_part(char c)
{
    return isalnum((unsigned char)c) || c == '_';
}

/**
 * Read a character, using the pushed-back lookahead if available.
 *
 * @return The lookahead character when one is pending (and clears it),
 *         otherwise whatever buffer_read() returns for the stream's buffer.
 */
static char read_char(TokenStream* ts)
{
    if (ts->has_lookahead) {
        ts->has_lookahead = 0;
        return ts->lookahead;
    }
    return buffer_read(ts->buffer);
}

/**
 * Put a character back into the lookahead slot.
 *
 * INPUT_END is never stored, so after end of input the next read_char()
 * goes back to buffer_read(). Only one character of pushback is kept; a
 * second call before a read overwrites the first.
 */
static void unread_char(TokenStream* ts, char c)
{
    if (c != INPUT_END) {
        ts->lookahead = c;
        ts->has_lookahead = 1;
    }
}

/**
 * Read the rest of a keyword or identifier whose first character has
 * already been consumed.
 *
 * Characters are consumed while they satisfy is_identifier_part(). The
 * character that ends the run is pushed back. Only the first
 * IDENTIFIER_CAPACITY - 1 characters are kept for the keyword comparison;
 * the remainder of an over-long identifier is consumed and discarded.
 *
 * @param ts     Token stream to read from.
 * @param first  First character of the identifier (already read).
 * @return The keyword token if the text matches a keyword, otherwise
 *         TOKEN_IDENTIFIER.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first)
{
    char text[IDENTIFIER_CAPACITY];
    size_t length = 0;

    text[length++] = first;

    char c;
    while ((c = read_char(ts)) != INPUT_END && is_identifier_part(c)) {
        /* Always leave room for the terminating NUL. */
        if (length < IDENTIFIER_CAPACITY - 1) {
            text[length++] = c;
        }
    }

    /* Put back the character that ended the identifier. */
    unread_char(ts, c);
    text[length] = '\0';

    return lookup_keyword(text);
}

/**
 * Create a token stream reading from the given buffer.
 *
 * @param buffer  Character source. tokenstream_close() calls buffer_close()
 *                on it, so the stream takes over closing the buffer.
 * @return A new stream, or NULL if buffer is NULL or allocation fails.
 */
TokenStream* tokenstream_open(Buffer* buffer)
{
    if (buffer == NULL) {
        return NULL;
    }

    TokenStream* ts = malloc(sizeof *ts);
    if (ts == NULL) {
        return NULL;
    }

    ts->buffer = buffer;
    ts->lookahead = 0;
    ts->has_lookahead = 0;
    return ts;
}

/**
 * Close the stream's buffer and free the stream. A NULL stream is ignored.
 */
void tokenstream_close(TokenStream* ts)
{
    if (ts == NULL) {
        return;
    }
    buffer_close(ts->buffer);
    free(ts);
}

/**
 * Read the next token from the stream.
 *
 * Whitespace and line comments (from two slashes to the end of the line)
 * are skipped first.
 *
 * @param ts  Token stream to read from.
 * @return The next token, or TS_NO_TOKEN (-1) when ts or its buffer is NULL,
 *         at end of input, for a '/' that does not start a comment, or for
 *         any other unrecognised character. These cases are not
 *         distinguishable from the return value.
 */
Token tokenstream_next(TokenStream* ts)
{
    if (ts == NULL || ts->buffer == NULL) {
        return TS_NO_TOKEN;
    }

    /* Skip whitespace and comments. */
    char c;
    while ((c = read_char(ts)) != INPUT_END) {
        /* Cast: negative char values are undefined for isspace(). */
        if (isspace((unsigned char)c)) {
            continue;
        }
        if (c != '/') {
            /* First character of a real token. */
            break;
        }

        char next = read_char(ts);
        if (next != '/') {
            /* A lone '/' is not a token; keep the following character. */
            unread_char(ts, next);
            return TS_NO_TOKEN;
        }

        /* Line comment: discard everything up to the newline or the end. */
        while ((c = read_char(ts)) != INPUT_END && c != '\n') {
            /* skip */
        }
    }

    if (c == INPUT_END) {
        return TS_NO_TOKEN;
    }

    /* Single-character tokens. */
    switch (c) {
    case '(': return TOKEN_PARENT_OPEN;
    case ')': return TOKEN_PARENT_CLOSE;
    case '[': return TOKEN_BRACKET_OPEN;
    case ']': return TOKEN_BRACKET_CLOSE;
    case ',': return TOKEN_COMMA;
    case ';': return TOKEN_SEMICOLON;
    default:  break;
    }

    /* Keywords and identifiers. */
    if (is_identifier_start(c)) {
        return read_keyword_or_identifier(ts, c);
    }

    /* Unknown character. */
    return TS_NO_TOKEN;
}