#include "token.h"
#include "buffer.h"

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

/**
 * Easy-to-read and modify token-to-string mapping.
 * Order must match the Token enum in token.h.
 */
static const char* const token_names[] = {
    "module",
    "import",
    "semicolon",
    "paren_open",
    "paren_close",
    "bracket_open",
    "bracket_close",
    "comma",
    "void",
    "identifier",
};

/**
 * State of an open token stream: the underlying buffer plus a
 * one-character pushback slot.
 */
struct TokenStream {
    Buffer* buffer;
    char lookahead;    /* Pushed-back character; valid only if has_lookahead. */
    int has_lookahead; /* Non-zero while `lookahead` holds a pending character. */
};

/**
 * Convert a Token enum to its string representation.
 * @param token The token to convert.
 * @returns The string name of the token, or "unknown" when the value does
 *          not index token_names (this includes the -1 returned by
 *          tokenstream_next).
 */
const char* token_to_string(Token token) {
    const size_t count = sizeof(token_names) / sizeof(token_names[0]);

    /* A single unsigned comparison rejects both negative and too-large
     * values, whichever signedness the compiler picked for the enum. */
    if ((size_t)token < count) {
        return token_names[token];
    }
    return "unknown";
}

/**
 * Check if a character is the end-of-input marker.
 *
 * This file treats (char)-1 coming out of read_char() as end of input.
 * NOTE(review): a data byte whose value equals (char)-1 cannot be told
 * apart from end of input here -- confirm against buffer_read()'s contract.
 */
static int is_end_of_input(char c) {
    return c == (char)-1;
}

/**
 * Check if a character is the start of an identifier.
 */
static int is_identifier_start(char c) {
    /* <ctype.h> functions require a value representable as unsigned char
     * (or EOF); passing a negative plain char is undefined behaviour. */
    return isalpha((unsigned char)c) || c == '_';
}

/**
 * Check if a character can be part of an identifier.
 */
static int is_identifier_part(char c) {
    /* Cast for the same reason as in is_identifier_start(). */
    return isalnum((unsigned char)c) || c == '_';
}

/**
 * Read a character, using lookahead if available.
 * A pending pushed-back character is returned (and cleared) first;
 * otherwise the next character comes from buffer_read().
 */
static char read_char(TokenStream* ts) {
    if (ts->has_lookahead) {
        ts->has_lookahead = 0;
        return ts->lookahead;
    }
    return buffer_read(ts->buffer);
}

/**
 * Put a character back into the lookahead buffer.
 * The end-of-input marker is never stored. There is a single slot, so a
 * second call before the next read_char() overwrites the first character.
 */
static void unread_char(TokenStream* ts, char c) {
    if (is_end_of_input(c)) {
        return;
    }
    ts->lookahead = c;
    ts->has_lookahead = 1;
}

/**
 * Try to read a keyword or identifier starting with the given character.
 * Characters beyond the capacity of the local text buffer are consumed
 * from the stream but not stored.
 * @param ts    The stream to read the remaining characters from.
 * @param first The already-consumed first character of the word.
 * @returns The keyword's token type, or TOKEN_IDENTIFIER if the word
 *          doesn't match a keyword.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
    char text[256];
    size_t length = 0;
    char c;

    text[length++] = first;

    for (;;) {
        c = read_char(ts);
        if (is_end_of_input(c) || !is_identifier_part(c)) {
            break;
        }
        /* Keep one byte free for the terminating NUL. */
        if (length < sizeof text - 1) {
            text[length++] = c;
        }
    }

    /* Put back the character that ended the identifier */
    unread_char(ts, c);
    text[length] = '\0';

    /* Check for keywords */
    if (strcmp(text, "module") == 0) {
        return TOKEN_MODULE;
    }
    if (strcmp(text, "import") == 0) {
        return TOKEN_IMPORT;
    }
    if (strcmp(text, "void") == 0) {
        return TOKEN_VOID;
    }
    return TOKEN_IDENTIFIER;
}

/**
 * Open a token stream over the file at the given path.
 * @param path Path handed to buffer_open_file().
 * @returns A newly allocated stream to be released with tokenstream_close(),
 *          or NULL if path is NULL, the buffer cannot be opened, or the
 *          allocation fails.
 */
TokenStream* tokenstream_open(const char* path) {
    if (path == NULL) {
        return NULL;
    }

    Buffer* buf = buffer_open_file(path);
    if (buf == NULL) {
        return NULL;
    }

    TokenStream* ts = malloc(sizeof *ts);
    if (ts == NULL) {
        /* Don't leak the buffer when the stream itself can't be allocated. */
        buffer_close(buf);
        return NULL;
    }

    ts->buffer = buf;
    ts->lookahead = 0;
    ts->has_lookahead = 0;
    return ts;
}

/**
 * Close the stream's buffer and free the stream.
 * @param ts The stream to close; NULL is ignored.
 */
void tokenstream_close(TokenStream* ts) {
    if (ts == NULL) {
        return;
    }
    buffer_close(ts->buffer);
    free(ts);
}

/**
 * Read the next token from the stream.
 * Whitespace and `//` line comments are skipped.
 * @param ts The stream to read from.
 * @returns The next token, or -1 when ts or its buffer is NULL, at end of
 *          input, for a '/' that does not start a `//` comment, or for any
 *          other character that starts no known token.
 */
Token tokenstream_next(TokenStream* ts) {
    if (ts == NULL || ts->buffer == NULL) {
        return -1;
    }

    char c;

    /* Skip whitespace and comments */
    for (;;) {
        c = read_char(ts);
        if (is_end_of_input(c)) {
            return -1; /* EOF */
        }
        if (isspace((unsigned char)c)) {
            continue;
        }
        if (c != '/') {
            /* We found a non-whitespace, non-comment character */
            break;
        }

        /* Handle comments */
        char next = read_char(ts);
        if (next != '/') {
            /* Put back the character after / */
            unread_char(ts, next);
            return -1;
        }

        /* Skip until end of line */
        do {
            c = read_char(ts);
        } while (!is_end_of_input(c) && c != '\n');

        /* Stop here instead of reading again once input is exhausted. */
        if (is_end_of_input(c)) {
            return -1; /* EOF */
        }
    }

    /* Single-character tokens */
    switch (c) {
        case '(': return TOKEN_PARENT_OPEN;
        case ')': return TOKEN_PARENT_CLOSE;
        case '[': return TOKEN_BRACKET_OPEN;
        case ']': return TOKEN_BRACKET_CLOSE;
        case ',': return TOKEN_COMMA;
        case ';': return TOKEN_SEMICOLON;
        default:  break;
    }

    /* Keywords and identifiers */
    if (is_identifier_start(c)) {
        return read_keyword_or_identifier(ts, c);
    }

    /* Unknown character */
    return -1;
}