c1106d8e66
The tokenstream_open function now takes a Buffer* parameter instead of a file path string, making the API more flexible and allowing the caller to manage buffer lifetime. The tokenstream_close function continues to close the underlying buffer as documented. - Changed tokenstream_open signature from (const char* path) to (Buffer* buffer) - Updated implementation to accept and use the provided buffer directly - Updated all tests to open buffers separately and pass them to tokenstream_open - Added #include "buffer.h" to token.h for Buffer type definition - All 15 tests pass Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
176 lines
3.6 KiB
C
176 lines
3.6 KiB
C
#include "token.h"
|
|
#include "buffer.h"
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
|
|
/**
 * Printable names for tokens, looked up by Token value.
 * The table is positional: entry i names the token whose enum value is i,
 * so the order here has to stay in sync with the Token enum in token.h.
 */
static const char* token_names[] = {
    "module", "import",
    "semicolon",
    "paren_open", "paren_close",
    "bracket_open", "bracket_close",
    "comma",
    "void",
    "identifier",
};
|
|
|
|
struct TokenStream {
|
|
Buffer* buffer;
|
|
char lookahead;
|
|
int has_lookahead;
|
|
};
|
|
|
|
/**
|
|
* Convert a Token enum to its string representation.
|
|
* @param token The token to convert.
|
|
* @returns The string name of the token.
|
|
*/
|
|
const char* token_to_string(Token token) {
|
|
int count = sizeof(token_names) / sizeof(token_names[0]);
|
|
if (token >= 0 && token < count) {
|
|
return token_names[token];
|
|
}
|
|
return "unknown";
|
|
}
|
|
|
|
/**
 * Check if a character is the start of an identifier.
 * @param c The character to test.
 * @returns 1 if c is a letter (per isalpha) or an underscore, 0 otherwise.
 */
static int is_identifier_start(char c) {
    /* <ctype.h> functions require a value representable as unsigned char
     * (or EOF); passing a negative plain char is undefined behavior. */
    return isalpha((unsigned char)c) || c == '_';
}
|
|
|
|
/**
 * Check if a character can be part of an identifier.
 * @param c The character to test.
 * @returns 1 if c is a letter or digit (per isalnum) or an underscore,
 *          0 otherwise.
 */
static int is_identifier_part(char c) {
    /* <ctype.h> functions require a value representable as unsigned char
     * (or EOF); passing a negative plain char is undefined behavior. */
    return isalnum((unsigned char)c) || c == '_';
}
|
|
|
|
/**
|
|
* Read a character, using lookahead if available.
|
|
*/
|
|
static char read_char(TokenStream* ts) {
|
|
if (ts->has_lookahead) {
|
|
ts->has_lookahead = 0;
|
|
return ts->lookahead;
|
|
}
|
|
return buffer_read(ts->buffer);
|
|
}
|
|
|
|
/**
|
|
* Put a character back into the lookahead buffer.
|
|
*/
|
|
static void unread_char(TokenStream* ts, char c) {
|
|
if (c != (char)-1) {
|
|
ts->lookahead = c;
|
|
ts->has_lookahead = 1;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Try to read a keyword or identifier starting with the given character.
|
|
* Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword.
|
|
*/
|
|
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
|
|
char buffer[256];
|
|
int index = 0;
|
|
buffer[index++] = first;
|
|
|
|
char c;
|
|
while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) {
|
|
if (index < 255) {
|
|
buffer[index++] = c;
|
|
}
|
|
}
|
|
|
|
/* Put back the character that ended the identifier */
|
|
unread_char(ts, c);
|
|
buffer[index] = '\0';
|
|
|
|
/* Check for keywords */
|
|
if (strcmp(buffer, "module") == 0) return TOKEN_MODULE;
|
|
if (strcmp(buffer, "import") == 0) return TOKEN_IMPORT;
|
|
if (strcmp(buffer, "void") == 0) return TOKEN_VOID;
|
|
|
|
return TOKEN_IDENTIFIER;
|
|
}
|
|
|
|
/**
 * Create a token stream that reads from an already opened buffer.
 * @param buffer The buffer to tokenize.
 * @returns A newly allocated stream with an empty push-back slot, or NULL
 *          when buffer is NULL or the allocation fails.
 */
TokenStream* tokenstream_open(Buffer* buffer) {
    TokenStream* ts = NULL;

    if (buffer != NULL) {
        ts = malloc(sizeof *ts);
        if (ts != NULL) {
            ts->buffer = buffer;
            ts->lookahead = 0;
            ts->has_lookahead = 0;
        }
    }
    return ts;
}
|
|
|
|
/**
 * Dispose of a token stream.
 * Calls buffer_close() on the stream's buffer and then frees the stream
 * itself. Passing NULL is a no-op.
 */
void tokenstream_close(TokenStream* ts) {
    if (ts != NULL) {
        buffer_close(ts->buffer);
        free(ts);
    }
}
|
|
|
|
/**
 * Read the next token from the stream.
 * Whitespace and "//" line comments are skipped. Single-character
 * punctuation maps directly to its token; a letter or underscore starts a
 * keyword or identifier.
 * @param ts The stream to read from.
 * @returns The token read, or -1 when ts (or its buffer) is NULL, at end of
 *          input, for a '/' that does not start a "//" comment, or for any
 *          other unrecognized character.
 *
 * NOTE(review): characters are carried in a plain char, so an input byte
 * that converts to (char)-1 is indistinguishable from the end-of-input
 * marker here — confirm against buffer_read()'s contract.
 */
Token tokenstream_next(TokenStream* ts) {
    if (ts == NULL || ts->buffer == NULL) return -1;

    char c;

    /* Skip whitespace and comments */
    while ((c = read_char(ts)) != (char)-1) {
        /* <ctype.h> functions need an unsigned char value; a negative plain
         * char (non-ASCII byte) would be undefined behavior. */
        if (isspace((unsigned char)c)) {
            continue;
        }

        /* Handle comments */
        if (c == '/') {
            char next = read_char(ts);
            if (next == '/') {
                /* Skip until end of line (or end of input) */
                while ((c = read_char(ts)) != (char)-1 && c != '\n') {
                    /* Skip */
                }
                continue;
            }
            /* A lone '/' is not a token: put back what followed it and
             * report failure. */
            unread_char(ts, next);
            return -1;
        }

        /* We found a non-whitespace, non-comment character */
        break;
    }

    if (c == (char)-1) return -1; /* EOF */

    /* Single-character tokens */
    switch (c) {
        case '(': return TOKEN_PARENT_OPEN;
        case ')': return TOKEN_PARENT_CLOSE;
        case '[': return TOKEN_BRACKET_OPEN;
        case ']': return TOKEN_BRACKET_CLOSE;
        case ',': return TOKEN_COMMA;
        case ';': return TOKEN_SEMICOLON;
        default: break; /* not punctuation; fall through to the word check */
    }

    /* Keywords and identifiers */
    if (is_identifier_start(c)) {
        return read_keyword_or_identifier(ts, c);
    }

    /* Unknown character */
    return -1;
}
|