Files
c2/v0/token.c
T
seeseemelk 49b9db5b75 Refactor token mapping: use keyword map for tokenization instead of strcmp
- Created KeywordMap structure with keyword-to-token mapping at top of token.c
- Added lookup_keyword() function to check if identifier is a keyword
- Replaced 3 strcmp calls (lines 99-101) with single lookup_keyword() call
- Removed token_to_string() function and its tests (3 tests removed)
- Single easy-to-read and modify keyword map serves both documentation and implementation
- New keywords can now be added by editing the keywords[] array at top of token.c

All 12 tests passing (removed token_to_string tests which are now unnecessary).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-24 09:44:19 +02:00

171 lines
3.4 KiB
C

#include "token.h"
#include "buffer.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
/**
* One entry of the keyword-to-token mapping: a keyword spelling paired
* with the token it produces.
* New keywords are added as entries of the keywords[] array, which is
* searched by lookup_keyword().
*/
typedef struct {
/* Exact spelling of the keyword, compared with strcmp(). */
const char* keyword;
/* Token returned when an identifier matches the spelling. */
Token token;
} KeywordMap;
static const KeywordMap keywords[] = {
{"module", TOKEN_MODULE},
{"import", TOKEN_IMPORT},
{"void", TOKEN_VOID},
};
/**
* Look up a keyword in the keyword map.
* Returns TOKEN_IDENTIFIER if not found.
*/
static Token lookup_keyword(const char* str) {
int count = sizeof(keywords) / sizeof(keywords[0]);
for (int i = 0; i < count; i++) {
if (strcmp(keywords[i].keyword, str) == 0) {
return keywords[i].token;
}
}
return TOKEN_IDENTIFIER;
}
/**
* Tokenizer state: the character source plus a one-character pushback slot.
*/
struct TokenStream {
/* Character source; read through buffer_read() and closed by tokenstream_close(). */
Buffer* buffer;
/* Character stored by unread_char(); only meaningful while has_lookahead is non-zero. */
char lookahead;
/* Non-zero when lookahead holds a character that read_char() must return next. */
int has_lookahead;
};
/**
 * Check if a character is the start of an identifier.
 * Returns non-zero for a letter (per isalpha()) or an underscore, 0 otherwise.
 */
static int is_identifier_start(char c) {
    /*
     * The <ctype.h> functions are only defined for EOF and values
     * representable as unsigned char; a negative plain char (any byte
     * >= 0x80 where char is signed) would be undefined behavior.
     */
    return isalpha((unsigned char)c) || c == '_';
}
/**
 * Check if a character can be part of an identifier.
 * Returns non-zero for a letter or digit (per isalnum()) or an underscore,
 * 0 otherwise.
 */
static int is_identifier_part(char c) {
    /*
     * The <ctype.h> functions are only defined for EOF and values
     * representable as unsigned char; a negative plain char (any byte
     * >= 0x80 where char is signed) would be undefined behavior.
     */
    return isalnum((unsigned char)c) || c == '_';
}
/**
 * Fetch the next character of the stream.
 * A character previously pushed back with unread_char() is returned first
 * (and the pushback slot is cleared); otherwise the character comes from
 * buffer_read() on the underlying buffer.
 */
static char read_char(TokenStream* ts) {
    if (!ts->has_lookahead) {
        return buffer_read(ts->buffer);
    }

    char pending = ts->lookahead;
    ts->has_lookahead = 0;
    return pending;
}
/**
 * Push a character back so the next read_char() returns it.
 * The (char)-1 end-of-input marker is never stored; passing it is a no-op.
 */
static void unread_char(TokenStream* ts, char c) {
    if (c == (char)-1) {
        return;
    }
    ts->has_lookahead = 1;
    ts->lookahead = c;
}
/**
 * Read the rest of a word whose first character has already been consumed
 * and classify it.
 * Characters are collected while is_identifier_part() accepts them; the
 * character that ends the word is pushed back. At most 255 characters are
 * kept; any further identifier characters are consumed but dropped.
 * Returns the keyword's token, or TOKEN_IDENTIFIER if the word is not a
 * keyword.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
    char text[256];
    const int max_length = (int)sizeof(text) - 1; /* leave room for the terminator */
    int length = 0;
    char next;

    text[length++] = first;

    for (;;) {
        next = read_char(ts);
        if (next == (char)-1 || !is_identifier_part(next)) {
            break;
        }
        if (length < max_length) {
            text[length++] = next;
        }
    }

    /* The terminating character belongs to the next token */
    unread_char(ts, next);
    text[length] = '\0';

    return lookup_keyword(text);
}
/**
 * Create a token stream reading from the given buffer.
 * Returns NULL when buffer is NULL or the allocation fails. The stream
 * starts with an empty pushback slot. tokenstream_close() closes the buffer
 * along with the stream.
 */
TokenStream* tokenstream_open(Buffer* buffer) {
    TokenStream* stream;

    if (buffer == NULL) {
        return NULL;
    }

    stream = malloc(sizeof *stream);
    if (stream != NULL) {
        stream->buffer = buffer;
        stream->lookahead = 0;
        stream->has_lookahead = 0;
    }
    return stream;
}
/**
 * Release a token stream: closes its buffer with buffer_close() and frees
 * the stream itself. Passing NULL is a no-op.
 */
void tokenstream_close(TokenStream* ts) {
    if (ts != NULL) {
        buffer_close(ts->buffer);
        free(ts);
    }
}
/**
 * Read the next token from the stream.
 *
 * Skips whitespace and `//` line comments, then classifies the next
 * character: punctuation maps to its single-character token, and an
 * identifier start is handed to read_keyword_or_identifier().
 *
 * Returns -1 when ts or its buffer is NULL, when read_char() yields the
 * (char)-1 end-of-input marker, for a '/' that does not begin a `//`
 * comment (the character after it is pushed back), and for any other
 * unrecognized character.
 */
Token tokenstream_next(TokenStream* ts) {
    if (ts == NULL || ts->buffer == NULL) return -1;

    char c;

    /* Skip whitespace and comments */
    while ((c = read_char(ts)) != (char)-1) {
        /*
         * <ctype.h> functions require a value representable as unsigned
         * char; a negative plain char would be undefined behavior.
         */
        if (isspace((unsigned char)c)) {
            continue;
        }

        /* Handle comments */
        if (c == '/') {
            char next = read_char(ts);
            if (next == '/') {
                /* Skip until end of line */
                while ((c = read_char(ts)) != (char)-1 && c != '\n') {
                    /* Skip */
                }
                if (c == (char)-1) {
                    /* Comment ran to end of input: stop instead of reading past EOF */
                    return -1;
                }
                continue;
            }
            /* Put back the character after / */
            unread_char(ts, next);
            return -1;
        }

        /* We found a non-whitespace, non-comment character */
        break;
    }

    if (c == (char)-1) return -1; /* EOF */

    /* Single-character tokens */
    switch (c) {
        case '(': return TOKEN_PARENT_OPEN;
        case ')': return TOKEN_PARENT_CLOSE;
        case '[': return TOKEN_BRACKET_OPEN;
        case ']': return TOKEN_BRACKET_CLOSE;
        case ',': return TOKEN_COMMA;
        case ';': return TOKEN_SEMICOLON;
        default: break;
    }

    /* Keywords and identifiers */
    if (is_identifier_start(c)) {
        return read_keyword_or_identifier(ts, c);
    }

    /* Unknown character */
    return -1;
}