/* c2/v0/token.c */

#include "token.h"
#include "log.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/*
 * Lexer state for a single source buffer.
 *
 * The position fields (pos, line, column, line_start) describe the next
 * unread character; the last_* fields remember where the most recent
 * non-EOF token ended.
 */
struct TokenStream {
    char*       filename;        /* heap-allocated copy of the file name, released on close */
    const char* code;            /* source text; stored by pointer and scanned up to its '\0' */
    size_t      pos;             /* index into code of the next unread character */
    int         line;            /* line number of the next unread character (starts at 1) */
    int         column;          /* column of the next unread character (starts at 1) */
    const char* line_start;      /* first character of the line currently being read */

    /* End of the last non-EOF token; the EOF token is positioned from these. */
    int         last_line;
    int         last_column_end;
    const char* last_line_start;
};

/**
 * Reserved words and the token type each one produces.
 * To introduce a new keyword, append an entry to the table below.
 */
typedef struct {
	const char* keyword;   /* spelling as it appears in source text */
	TokenType token;       /* token type reported for that spelling */
} KeywordMap;

static const KeywordMap keywords[] = {
	/* Declarations and visibility */
	{ "module", TOKEN_MODULE },
	{ "import", TOKEN_IMPORT },
	{ "alias",  TOKEN_ALIAS },
	{ "public", TOKEN_PUBLIC },

	/* Built-in types */
	{ "void",   TOKEN_VOID },
	{ "i8",     TOKEN_I8 },
	{ "i16",    TOKEN_I16 },
	{ "i32",    TOKEN_I32 },
	{ "i64",    TOKEN_I64 },
	{ "u8",     TOKEN_U8 },
	{ "u16",    TOKEN_U16 },
	{ "u32",    TOKEN_U32 },
	{ "u64",    TOKEN_U64 },
};

/**
 * Look up a keyword in the keyword map.
 * Returns TOKEN_IDENTIFIER if not found.
 */
static TokenType lookup_keyword(const char* str, size_t length) {
    int count = sizeof(keywords) / sizeof(keywords[0]);
    int i;
    for (i = 0; i < count; i++) {
		if (strlen(keywords[i].keyword) == length &&
		    strncmp(keywords[i].keyword, str, length) == 0) {
			return keywords[i].token;
		}
	}
	return TOKEN_IDENTIFIER;
}

/**
 * Check if a character is the start of an identifier.
 *
 * Returns 1 when c is an alphabetic character (as classified by isalpha)
 * or an underscore, and 0 otherwise.
 *
 * The value is converted to unsigned char before calling isalpha: the
 * <ctype.h> functions have undefined behavior for negative arguments other
 * than EOF, which a plain char can hold for bytes >= 0x80 when char is
 * signed.
 */
static int is_identifier_start(char c) {
	return isalpha((unsigned char)c) || c == '_';
}

/**
 * Check if a character can be part of an identifier.
 *
 * Returns 1 when c is a letter or digit (as classified by isalnum) or an
 * underscore, and 0 otherwise.
 *
 * The value is converted to unsigned char before calling isalnum: the
 * <ctype.h> functions have undefined behavior for negative arguments other
 * than EOF, which a plain char can hold for bytes >= 0x80 when char is
 * signed.
 */
static int is_identifier_part(char c) {
	return isalnum((unsigned char)c) || c == '_';
}

/**
 * Look at the character at the stream's current position without
 * consuming it. The stream state is left untouched.
 */
static char peek_char(TokenStream* ts) {
	const char* cursor = ts->code + ts->pos;
	return *cursor;
}

/**
 * Consume one character and keep the line/column bookkeeping in sync.
 *
 * Returns the consumed character. At the terminating '\0' nothing is
 * consumed and '\0' is returned, so the position never moves past the end
 * of the text.
 */
static char read_char(TokenStream* ts) {
	const char current = ts->code[ts->pos];

	switch (current) {
	case '\0':
		/* End of input: leave every field as it is. */
		return '\0';

	case '\n':
		/* The next character opens a new line at column 1. */
		ts->pos++;
		ts->line++;
		ts->column = 1;
		ts->line_start = ts->code + ts->pos;
		break;

	default:
		ts->pos++;
		ts->column++;
		break;
	}
	return current;
}

/**
 * Count the characters from line_start up to, but not including, the next
 * '\n' or the terminating '\0', whichever comes first.
 */
static size_t get_line_length(const char* line_start) {
	/* strcspn stops at the first '\n' or at the string terminator. */
	return strcspn(line_start, "\n");
}

/**
 * Build a token of the given type and record where it ended.
 *
 * text/length describe the token's characters (the pointer is stored, not
 * copied). line and column give the position of its first character, and
 * line_start points at the beginning of the line it sits on. column_end is
 * the column of the token's last character.
 *
 * For every type except TOKEN_EOF the stream's last_* fields are updated
 * to this token's line, end column and line start.
 */
static Token create_token(TokenStream* ts, TokenType type, const char* text, size_t length, int line, int column, const char* line_start) {
	Token tok;

	/* What the token is and which characters it covers. */
	tok.token = type;
	tok.text.length = length;
	tok.text.data = (char*)text;

	/* Where it sits in the source. */
	tok.location.filename = ts->filename;
	tok.location.line = line;
	tok.location.column_start = column;
	tok.location.column_end = column + (int)length - 1;
	tok.location.line_text.length = get_line_length(line_start);
	tok.location.line_text.data = (char*)line_start;

	if (type == TOKEN_EOF) {
		return tok;
	}

	/* Remember the end of this token for later use by the stream. */
	ts->last_line_start = tok.location.line_text.data;
	ts->last_column_end = tok.location.column_end;
	ts->last_line = tok.location.line;
	return tok;
}

/**
 * Create a token stream over a source buffer.
 *
 * filename  Name used in token locations. It is copied; NULL is replaced
 *           by "unknown".
 * code      Source text, scanned up to its terminating '\0'. Only the
 *           pointer is stored, so the buffer must stay valid for as long
 *           as the stream is in use.
 *
 * Returns the new stream, or NULL when code is NULL or an allocation
 * fails. Release the stream with tokenstream_close().
 */
TokenStream* tokenstream_open(const char* filename, const char* code) {
    /* Declarations first for C89 */
    TokenStream* ts;
    const char* name_src;
    size_t name_size;

    if (code == NULL) return NULL;

    ts = malloc(sizeof *ts);
    if (ts == NULL) {
        return NULL;
    }

    name_src = filename ? filename : "unknown";
    name_size = strlen(name_src) + 1;
    ts->filename = malloc(name_size);
    if (ts->filename == NULL) {
        /*
         * Do not hand out a stream without a file name: it would end up in
         * every token location. Fail like any other allocation error.
         */
        free(ts);
        return NULL;
    }
    memcpy(ts->filename, name_src, name_size);

    ts->code = code;
    ts->pos = 0;
    ts->line = 1;
    ts->column = 1;
    ts->line_start = code;

    /* No token yet: an immediate EOF is reported at line 1, column 1. */
    ts->last_line = 1;
    ts->last_column_end = 0;
    ts->last_line_start = code;
    return ts;
}

/**
 * Release a token stream and its copy of the file name.
 * Passing NULL is allowed and does nothing. The source text itself is not
 * freed.
 */
void tokenstream_close(TokenStream* ts) {
	if (ts != NULL) {
		/* free(NULL) is a no-op, so the name needs no separate check. */
		free(ts->filename);
		free(ts);
	}
}

Token tokenstream_next(TokenStream* ts) {
    /* Declarations first for C89 */
    char c;
    int start_line;
    int start_column;
    const char* line_start;
    const char* start_text;
    Token t;

    if (ts == NULL) {
        Token t = {0};
        t.token = TOKEN_EOF;
        return t;
    }

    /* Skip whitespace and comments */
    while ((c = peek_char(ts)) != '\0') {
		if (isspace(c)) {
			read_char(ts);
			continue;
		}

		/* Handle comments */
		if (c == '/') {
			if (ts->code[ts->pos + 1] == '/') {
				/* Skip until end of line */
				while ((c = read_char(ts)) != '\0' && c != '\n') {
					/* Skip */
				}
				continue;
			}
			/* It's just a slash, which we don't handle yet */
			break;
		}

		/* We found a non-whitespace, non-comment character */
		break;
	}

	if (peek_char(ts) == '\0') {
		Token t;
		t.token = TOKEN_EOF;
		t.text.data = NULL;
		t.text.length = 0;
		t.location.filename = ts->filename;

		t.location.line = ts->last_line;
		t.location.column_start = ts->last_column_end + 1;
		t.location.column_end = ts->last_column_end + 1;
		t.location.line_text.data = (char*)ts->last_line_start;
		t.location.line_text.length = get_line_length(ts->last_line_start);
		return t;
	}

    start_line = ts->line;
    start_column = ts->column;
    line_start = ts->line_start;
    start_text = &ts->code[ts->pos];

    c = read_char(ts);

	/* Single-character tokens */
	switch (c) {
	case '(': return create_token(ts, TOKEN_PARENT_OPEN, start_text, 1, start_line, start_column, line_start);
	case ')': return create_token(ts, TOKEN_PARENT_CLOSE, start_text, 1, start_line, start_column, line_start);
	case '[': return create_token(ts, TOKEN_BRACKET_OPEN, start_text, 1, start_line, start_column, line_start);
	case ']': return create_token(ts, TOKEN_BRACKET_CLOSE, start_text, 1, start_line, start_column, line_start);
	case ',': return create_token(ts, TOKEN_COMMA, start_text, 1, start_line, start_column, line_start);
	case ';': return create_token(ts, TOKEN_SEMICOLON, start_text, 1, start_line, start_column, line_start);
    case '=': return create_token(ts, TOKEN_ASSIGN, start_text, 1, start_line, start_column, line_start);
	}

	/* Keywords and identifiers */
    if (is_identifier_start(c)) {
        /* Declarations first for C89 */
        size_t length;
        TokenType type;

        length = 1;
        while (is_identifier_part(peek_char(ts))) {
            read_char(ts);
            length++;
        }
        type = lookup_keyword(start_text, length);
        return create_token(ts, type, start_text, length, start_line, start_column, line_start);
    }

	/* Unknown character */
    t = create_token(ts, TOKEN_UNKNOWN, start_text, 1, start_line, start_column, line_start);
	log_on_line(&t.location, "unexpected token '%c'", c);
	return t;
}