Files
c2/v0/token.c
T
2026-04-29 20:15:05 +02:00

255 lines
6.3 KiB
C

#include "token.h"
#include "log.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
/**
 * Lexer state for one source buffer.
 *
 * Created by tokenstream_open(), advanced by tokenstream_next() and
 * released by tokenstream_close().
 */
struct TokenStream {
/* Heap copy of the file name made by tokenstream_open(); freed by
 * tokenstream_close(). Copied by pointer into every token location. */
char* filename;
/* Source text being tokenized. The pointer passed to tokenstream_open()
 * is stored as-is; tokenstream_close() does not free it. */
const char* code;
/* Index into code of the next character to be read. */
size_t pos;
/* Line and column of the character at pos; both start at 1. */
int line;
int column;
/* First character of the line that contains pos. */
const char* line_start;
/* Where the most recent non-EOF token ended. create_token() records it
 * and tokenstream_next() uses it to place the EOF token. */
int last_line;
int last_column_end;
const char* last_line_start;
};
/**
 * One entry of the keyword-to-token mapping: a reserved word and the
 * token type reported for it.
 * To add a new keyword, append an entry to the keywords[] table below.
 */
typedef struct {
/* Spelling of the keyword as a NUL-terminated string. */
const char* keyword;
/* Token type lookup_keyword() returns when the spelling matches. */
TokenType token;
} KeywordMap;
/**
 * Reserved words recognised by the lexer.
 * lookup_keyword() scans this table from top to bottom and returns the
 * token of the first entry whose spelling matches exactly.
 * Add new keywords here.
 */
static const KeywordMap keywords[] = {
/* Declaration keywords */
{"module", TOKEN_MODULE},
{"import", TOKEN_IMPORT},
{"alias", TOKEN_ALIAS},
{"public", TOKEN_PUBLIC},
/* Built-in type names */
{"void", TOKEN_VOID},
{"i8", TOKEN_I8},
{"i16", TOKEN_I16},
{"i32", TOKEN_I32},
{"i64", TOKEN_I64},
{"u8", TOKEN_U8},
{"u16", TOKEN_U16},
{"u32", TOKEN_U32},
{"u64", TOKEN_U64},
};
/**
* Look up a keyword in the keyword map.
* Returns TOKEN_IDENTIFIER if not found.
*/
static TokenType lookup_keyword(const char* str, size_t length) {
int count = sizeof(keywords) / sizeof(keywords[0]);
int i;
for (i = 0; i < count; i++) {
if (strlen(keywords[i].keyword) == length &&
strncmp(keywords[i].keyword, str, length) == 0) {
return keywords[i].token;
}
}
return TOKEN_IDENTIFIER;
}
/**
 * Check if a character is the start of an identifier.
 *
 * Returns non-zero when c is a letter (as decided by isalpha()) or an
 * underscore, and zero otherwise.
 */
static int is_identifier_start(char c) {
    /* <ctype.h> functions take an int that must be representable as
     * unsigned char (or be EOF). Plain char may be signed, so bytes
     * >= 0x80 would otherwise be passed as negative values, which is
     * undefined behaviour. */
    return isalpha((unsigned char)c) || c == '_';
}
/**
 * Check if a character can be part of an identifier.
 *
 * Returns non-zero when c is a letter or digit (as decided by isalnum())
 * or an underscore, and zero otherwise.
 */
static int is_identifier_part(char c) {
    /* <ctype.h> functions take an int that must be representable as
     * unsigned char (or be EOF). Plain char may be signed, so bytes
     * >= 0x80 would otherwise be passed as negative values, which is
     * undefined behaviour. */
    return isalnum((unsigned char)c) || c == '_';
}
/**
 * Return the character at the current read position without consuming
 * it. The stream state is left untouched.
 */
static char peek_char(TokenStream* ts) {
    const char* cursor = ts->code + ts->pos;

    return *cursor;
}
/**
 * Consume one character and return it.
 *
 * At the terminating '\0' nothing is consumed and '\0' is returned.
 * Otherwise the position advances by one. A newline moves to column 1 of
 * the next line and records where that line begins; any other character
 * just advances the column.
 */
static char read_char(TokenStream* ts) {
    const char consumed = ts->code[ts->pos];

    if (consumed != '\0') {
        ts->pos += 1;
        if (consumed != '\n') {
            ts->column += 1;
        } else {
            ts->line += 1;
            ts->column = 1;
            /* pos already points past the newline, i.e. at the new line. */
            ts->line_start = ts->code + ts->pos;
        }
    }
    return consumed;
}
/**
 * Number of characters from line_start up to, but not including, the
 * next '\n' or the terminating '\0', whichever comes first.
 */
static size_t get_line_length(const char* line_start) {
    /* strcspn stops at the first '\n' or at the end of the string. */
    return strcspn(line_start, "\n");
}
/**
 * Build a token of the given type and record where it ended.
 *
 * text/length describe the lexeme; the token stores the pointer, it does
 * not copy the characters. line/column are the position of the first
 * character, and column_end is the column of the last one. line_start
 * points at the beginning of the source line and is stored together with
 * that line's length.
 *
 * For every type except TOKEN_EOF the stream's last_* fields are updated
 * to the end of this token.
 */
static Token create_token(TokenStream* ts, TokenType type, const char* text, size_t length, int line, int column, const char* line_start) {
    Token result;
    const int final_column = column + (int)length - 1;

    result.token = type;

    result.text.data = (char*)text;
    result.text.length = length;

    result.location.filename = ts->filename;
    result.location.line = line;
    result.location.column_start = column;
    result.location.column_end = final_column;
    result.location.line_text.data = (char*)line_start;
    result.location.line_text.length = get_line_length(line_start);

    /* EOF tokens must not move the "last real token" marker. */
    if (type == TOKEN_EOF) {
        return result;
    }

    ts->last_line = result.location.line;
    ts->last_column_end = result.location.column_end;
    ts->last_line_start = result.location.line_text.data;
    return result;
}
/**
 * Create a token stream over a NUL-terminated source buffer.
 *
 * filename is copied into the stream; when it is NULL the name "unknown"
 * is used instead. code is NOT copied: the stream keeps the pointer, so
 * the buffer must stay valid for as long as the stream is in use.
 *
 * Returns the new stream, or NULL when code is NULL or an allocation
 * fails. Release the stream with tokenstream_close().
 */
TokenStream* tokenstream_open(const char* filename, const char* code) {
    /* Declarations first for C89 */
    TokenStream* ts;
    const char* name_src;
    size_t name_size;

    if (code == NULL) return NULL;

    ts = malloc(sizeof *ts);
    if (ts == NULL) {
        return NULL;
    }

    name_src = filename ? filename : "unknown";
    name_size = strlen(name_src) + 1; /* include the terminator */
    ts->filename = malloc(name_size);
    if (ts->filename == NULL) {
        /* Fail as a whole instead of handing out a stream whose NULL
         * filename would end up in every token location. */
        free(ts);
        return NULL;
    }
    memcpy(ts->filename, name_src, name_size);

    ts->code = code;
    ts->pos = 0;
    ts->line = 1;
    ts->column = 1;
    ts->line_start = code;

    /* No token has been produced yet: an immediate EOF is reported at
     * line 1, column 1 (last_column_end + 1). */
    ts->last_line = 1;
    ts->last_column_end = 0;
    ts->last_line_start = code;
    return ts;
}
/**
 * Release a stream created by tokenstream_open().
 *
 * Frees the stream and its copy of the file name. The source buffer is
 * not freed. Passing NULL does nothing.
 */
void tokenstream_close(TokenStream* ts) {
    if (ts != NULL) {
        /* free() accepts NULL, so no separate check on filename is needed. */
        free(ts->filename);
        free(ts);
    }
}
/**
 * Advance past whitespace and `//` line comments.
 *
 * On return the stream is positioned at the first character of the next
 * token, or at the terminating '\0'. A '/' that is not followed by a
 * second '/' is left in place for the caller.
 */
static void skip_whitespace_and_comments(TokenStream* ts) {
    char c;

    while ((c = peek_char(ts)) != '\0') {
        /* The cast matters: <ctype.h> functions are undefined for
         * negative values, which plain char yields for bytes >= 0x80. */
        if (isspace((unsigned char)c)) {
            read_char(ts);
            continue;
        }
        if (c == '/' && ts->code[ts->pos + 1] == '/') {
            /* Consume up to and including the newline (or up to the end
             * of input). */
            do {
                c = read_char(ts);
            } while (c != '\0' && c != '\n');
            continue;
        }
        /* We found a non-whitespace, non-comment character */
        break;
    }
}

/**
 * Read the next token from the stream.
 *
 * Whitespace and `//` comments are skipped. The result is one of:
 *  - a single-character punctuation token: ( ) [ ] , ; =
 *  - a keyword token, or TOKEN_IDENTIFIER, for a run of letters, digits
 *    and underscores that does not start with a digit
 *  - TOKEN_UNKNOWN for any other character (including a lone '/'); the
 *    character is consumed and a message is logged via log_on_line()
 *  - TOKEN_EOF at the end of input, located one column past the end of
 *    the last non-EOF token; it is returned again on every further call
 *
 * A NULL stream yields a zero-initialised TOKEN_EOF token.
 *
 * The token's text and line_text point into the source buffer given to
 * tokenstream_open() and are not NUL-terminated per token; filename
 * points at storage owned by the stream.
 */
Token tokenstream_next(TokenStream* ts) {
    /* Declarations first for C89 */
    char c;
    int start_line;
    int start_column;
    const char* line_start;
    const char* start_text;
    Token t = {0};

    if (ts == NULL) {
        t.token = TOKEN_EOF;
        return t;
    }

    skip_whitespace_and_comments(ts);

    if (peek_char(ts) == '\0') {
        /* Report EOF right after the last real token rather than at the
         * current position, which may be past trailing blank lines. */
        t.token = TOKEN_EOF;
        t.text.data = NULL;
        t.text.length = 0;
        t.location.filename = ts->filename;
        t.location.line = ts->last_line;
        t.location.column_start = ts->last_column_end + 1;
        t.location.column_end = ts->last_column_end + 1;
        t.location.line_text.data = (char*)ts->last_line_start;
        t.location.line_text.length = get_line_length(ts->last_line_start);
        return t;
    }

    /* Remember where the token starts before consuming anything. */
    start_line = ts->line;
    start_column = ts->column;
    line_start = ts->line_start;
    start_text = &ts->code[ts->pos];
    c = read_char(ts);

    /* Single-character tokens */
    switch (c) {
        case '(': return create_token(ts, TOKEN_PARENT_OPEN, start_text, 1, start_line, start_column, line_start);
        case ')': return create_token(ts, TOKEN_PARENT_CLOSE, start_text, 1, start_line, start_column, line_start);
        case '[': return create_token(ts, TOKEN_BRACKET_OPEN, start_text, 1, start_line, start_column, line_start);
        case ']': return create_token(ts, TOKEN_BRACKET_CLOSE, start_text, 1, start_line, start_column, line_start);
        case ',': return create_token(ts, TOKEN_COMMA, start_text, 1, start_line, start_column, line_start);
        case ';': return create_token(ts, TOKEN_SEMICOLON, start_text, 1, start_line, start_column, line_start);
        case '=': return create_token(ts, TOKEN_ASSIGN, start_text, 1, start_line, start_column, line_start);
        default: break;
    }

    /* Keywords and identifiers */
    if (is_identifier_start(c)) {
        /* Declarations first for C89 */
        size_t length;
        TokenType type;

        length = 1;
        while (is_identifier_part(peek_char(ts))) {
            read_char(ts);
            length++;
        }
        type = lookup_keyword(start_text, length);
        return create_token(ts, type, start_text, length, start_line, start_column, line_start);
    }

    /* Unknown character */
    t = create_token(ts, TOKEN_UNKNOWN, start_text, 1, start_line, start_column, line_start);
    log_on_line(&t.location, "unexpected token '%c'", c);
    return t;
}