c2/v0/token.c

#include "token.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/**
 * State of one tokenizer instance.
 * The source text is referenced, not copied: tokenstream_open() stores the
 * caller's pointer and tokenstream_close() frees only this struct.
 */
struct TokenStream {
	const char* code;     /* NUL-terminated source text being tokenized */
	size_t pos;           /* index into code of the next unread character */
	TokenInfo last_info;  /* record of the most recent tokenstream_next() result */
};

/**
 * One entry of the keyword table: the exact spelling of a reserved word
 * and the token it maps to.
 */
typedef struct {
	const char* keyword;
	Token token;
} KeywordMap;

/**
 * Reserved words recognised by the tokenizer.
 * To add a keyword, append a new entry; lookup_keyword() scans the whole
 * table, so the order of entries does not matter.
 */
static const KeywordMap keywords[] = {
	{ .keyword = "module", .token = TOKEN_MODULE },
	{ .keyword = "import", .token = TOKEN_IMPORT },
	{ .keyword = "void",   .token = TOKEN_VOID },
};

/**
 * Look up a keyword in the keyword map.
 * Returns TOKEN_IDENTIFIER if not found.
 */
static Token lookup_keyword(const char* str, size_t length) {
	int count = sizeof(keywords) / sizeof(keywords[0]);
	for (int i = 0; i < count; i++) {
		if (strlen(keywords[i].keyword) == length &&
		    strncmp(keywords[i].keyword, str, length) == 0) {
			return keywords[i].token;
		}
	}
	return TOKEN_IDENTIFIER;
}

/**
 * Check if a character is the start of an identifier.
 *
 * An identifier starts with an ASCII letter or an underscore.
 * Returns 1 if c may start an identifier, 0 otherwise.
 */
static int is_identifier_start(char c) {
	/*
	 * <ctype.h> functions require a value representable as unsigned char
	 * (or EOF); passing a negative plain char is undefined behavior, so
	 * convert first. Bytes outside ASCII are rejected outright so that the
	 * (char)-1 end-of-input sentinel used by read_char()/peek_char() is
	 * never classified as a letter, whatever the locale or the signedness
	 * of char.
	 */
	const unsigned char uc = (unsigned char)c;
	return c == '_' || (uc < 0x80 && isalpha(uc));
}

/**
 * Check if a character can be part of an identifier.
 *
 * Identifier characters are ASCII letters, ASCII digits and the underscore.
 * Returns 1 if c may appear inside an identifier, 0 otherwise.
 */
static int is_identifier_part(char c) {
	/*
	 * <ctype.h> functions require a value representable as unsigned char
	 * (or EOF); passing a negative plain char is undefined behavior, so
	 * convert first. Bytes outside ASCII are rejected outright so that the
	 * (char)-1 end-of-input sentinel returned by peek_char() can never
	 * extend an identifier, whatever the locale or the signedness of char.
	 */
	const unsigned char uc = (unsigned char)c;
	return c == '_' || (uc < 0x80 && isalnum(uc));
}

/**
 * Consume and return the character at the current position.
 *
 * At the terminating NUL the position is left unchanged and the
 * end-of-input sentinel (char)-1 is returned instead.
 */
static char read_char(TokenStream* ts) {
	const char current = ts->code[ts->pos];
	if (current != '\0') {
		ts->pos++;
		return current;
	}
	return (char)-1;
}

/**
 * Return the character at the current position without consuming it.
 *
 * Yields the end-of-input sentinel (char)-1 when the terminating NUL
 * has been reached.
 */
static char peek_char(TokenStream* ts) {
	const char upcoming = ts->code[ts->pos];
	return (upcoming == '\0') ? (char)-1 : upcoming;
}

/**
 * Finish scanning an identifier whose first character has already been
 * consumed, then classify it as a keyword or a plain identifier.
 *
 * Records the token, a pointer to its first character inside the source
 * buffer, and its length in ts->last_info, and returns the token.
 * The `first` parameter is not needed: the lexeme is read back from the
 * buffer starting one position before the current one.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
	(void)first;

	/* The caller consumed the first character, so the lexeme begins at pos - 1. */
	const size_t begin = ts->pos - 1;

	while (is_identifier_part(peek_char(ts))) {
		read_char(ts);
	}

	const char* lexeme = ts->code + begin;
	const size_t span = ts->pos - begin;
	const Token kind = lookup_keyword(lexeme, span);

	ts->last_info.text = (char*)lexeme;
	ts->last_info.text_length = span;
	ts->last_info.token = kind;
	return kind;
}

/**
 * Create a token stream over a NUL-terminated source string.
 *
 * The string is not copied; only the pointer is stored, and it is read by
 * later tokenstream_next() calls.
 * Returns NULL when code is NULL or the allocation fails. Release the
 * stream with tokenstream_close().
 */
TokenStream* tokenstream_open(const char* code) {
	if (code == NULL) {
		return NULL;
	}

	TokenStream* stream = malloc(sizeof *stream);
	if (stream != NULL) {
		stream->code = code;
		stream->pos = 0;
		/* No token has been read yet. */
		stream->last_info.token = (Token)-1;
		stream->last_info.text = NULL;
		stream->last_info.text_length = 0;
	}
	return stream;
}

/**
 * Release a stream created by tokenstream_open().
 * Passing NULL is allowed and does nothing. The source string is not freed.
 */
void tokenstream_close(TokenStream* ts) {
	/* free() accepts NULL, so no separate guard is required. */
	free(ts);
}

/**
 * Reset the last-token record so tokenstream_info() does not report a
 * stale token after end of input or an error.
 */
static void reset_last_info(TokenStream* ts) {
	ts->last_info.token = (Token)-1;
	ts->last_info.text = NULL;
	ts->last_info.text_length = 0;
}

/**
 * Scan and return the next token from the stream.
 *
 * Whitespace and `//` line comments are skipped. On success the token is
 * returned and also recorded in the stream (retrievable through
 * tokenstream_info()) together with a pointer into the source buffer and
 * the token's length; the text is not NUL-terminated at the token's end.
 *
 * Returns (Token)-1 when ts is NULL, at end of input, on a lone '/'
 * (not handled yet), or on any unrecognised character. In the last three
 * cases the recorded token info is reset.
 */
Token tokenstream_next(TokenStream* ts) {
	if (ts == NULL) return (Token)-1;

	char c;

	/* Skip whitespace and comments */
	while ((c = read_char(ts)) != (char)-1) {
		/* <ctype.h> needs an unsigned char value; a negative char is UB. */
		if (isspace((unsigned char)c)) {
			continue;
		}

		/* Handle comments */
		if (c == '/') {
			if (peek_char(ts) == '/') {
				/* Skip until end of line (or end of input) */
				while ((c = read_char(ts)) != (char)-1 && c != '\n') {
					/* Skip */
				}
				continue;
			}
			/*
			 * It's just a slash, which we don't handle yet. Reset the
			 * record like every other error path so callers never see
			 * the previous token's info.
			 */
			reset_last_info(ts);
			return (Token)-1;
		}

		/* We found a non-whitespace, non-comment character */
		break;
	}

	if (c == (char)-1) {
		reset_last_info(ts);
		return (Token)-1; /* EOF */
	}

	/* Single-character tokens: c was just consumed, so it sits at pos - 1 */
	ts->last_info.text = (char*)&ts->code[ts->pos - 1];
	ts->last_info.text_length = 1;

	switch (c) {
	case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
	case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
	case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
	case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
	case ',': return ts->last_info.token = TOKEN_COMMA;
	case ';': return ts->last_info.token = TOKEN_SEMICOLON;
	default: break;
	}

	/* Keywords and identifiers */
	if (is_identifier_start(c)) {
		return read_keyword_or_identifier(ts, c);
	}

	/* Unknown character */
	reset_last_info(ts);
	return (Token)-1;
}

/**
 * Copy the record of the most recently scanned token into *info.
 * Does nothing when either pointer is NULL.
 */
void tokenstream_info(TokenStream* ts, TokenInfo* info) {
	if (ts != NULL && info != NULL) {
		*info = ts->last_info;
	}
}