/* c2/v0/token.c */

#include "token.h"
#include "buffer.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/**
 * Token-to-string mapping used by token_to_string().
 * Order must match the Token enum in token.h: the entry at index i is the
 * printable name of the token whose enum value is i.
 * Both the strings and the table itself are read-only.
 */
static const char* const token_names[] = {
	"module",
	"import",
	"semicolon",
	"paren_open",
	"paren_close",
	"bracket_open",
	"bracket_close",
	"comma",
	"void",
	"identifier",
};

/**
 * State of one token stream: the underlying input buffer plus a
 * one-character pushback slot.
 */
struct TokenStream {
	/* Input source; set in tokenstream_open and released in tokenstream_close. */
	Buffer* buffer;
	/* Character stored by unread_char; meaningful only while has_lookahead is non-zero. */
	char lookahead;
	/* Non-zero when `lookahead` holds a character that read_char must return next. */
	int has_lookahead;
};

/**
 * Look up the printable name of a token.
 * @param token The token to name.
 * @returns The matching entry of token_names, or "unknown" when the value
 *          does not index into that table.
 */
const char* token_to_string(Token token) {
	const size_t name_count = sizeof(token_names) / sizeof(token_names[0]);
	const size_t slot = (size_t)token;

	/* A negative value converts to a very large size_t, so one bound check covers both ends. */
	return slot < name_count ? token_names[slot] : "unknown";
}

/**
 * Check if a character is the start of an identifier.
 * @param c The character to test.
 * @returns Non-zero if c is alphabetic (per isalpha) or an underscore, 0 otherwise.
 */
static int is_identifier_start(char c) {
	/* <ctype.h> functions require a value representable as unsigned char (or EOF);
	 * a plain char may be negative, which would be undefined behavior. */
	return isalpha((unsigned char)c) || c == '_';
}

/**
 * Check if a character can be part of an identifier.
 * @param c The character to test.
 * @returns Non-zero if c is alphanumeric (per isalnum) or an underscore, 0 otherwise.
 */
static int is_identifier_part(char c) {
	/* <ctype.h> functions require a value representable as unsigned char (or EOF);
	 * a plain char may be negative, which would be undefined behavior. */
	return isalnum((unsigned char)c) || c == '_';
}

/**
 * Fetch the next character of the stream.
 * A character previously stored by unread_char is handed back first (and the
 * pushback slot is cleared); otherwise the character comes from buffer_read.
 */
static char read_char(TokenStream* ts) {
	if (!ts->has_lookahead) {
		return buffer_read(ts->buffer);
	}

	ts->has_lookahead = 0;
	return ts->lookahead;
}

/**
 * Push a character back so the next read_char returns it.
 * The (char)-1 marker is never stored; in that case the stream is left untouched.
 */
static void unread_char(TokenStream* ts, char c) {
	if (c == (char)-1) {
		return;
	}

	ts->has_lookahead = 1;
	ts->lookahead = c;
}

/**
 * Consume the rest of a word whose first character has already been read and
 * classify it.
 * Returns TOKEN_MODULE, TOKEN_IMPORT or TOKEN_VOID for the matching keyword,
 * and TOKEN_IDENTIFIER for any other word.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
	char text[256];
	int length = 0;
	char next;

	text[length++] = first;

	for (;;) {
		next = read_char(ts);
		if (next == (char)-1 || !is_identifier_part(next)) {
			break;
		}
		/* Characters past the capacity of `text` are consumed but not stored. */
		if (length < 255) {
			text[length++] = next;
		}
	}

	text[length] = '\0';

	/* The terminating character belongs to the next token, so hand it back. */
	unread_char(ts, next);

	if (strcmp(text, "module") == 0) {
		return TOKEN_MODULE;
	}
	if (strcmp(text, "import") == 0) {
		return TOKEN_IMPORT;
	}
	if (strcmp(text, "void") == 0) {
		return TOKEN_VOID;
	}

	return TOKEN_IDENTIFIER;
}

/**
 * Create a token stream reading from the file at `path`.
 * @param path Path handed to buffer_open_file; may be NULL.
 * @returns A new stream with an empty pushback slot, or NULL when path is
 *          NULL, the buffer cannot be opened, or allocation fails. Release
 *          the stream with tokenstream_close.
 */
TokenStream* tokenstream_open(const char* path) {
	if (path == NULL) return NULL;

	Buffer* source = buffer_open_file(path);
	if (source == NULL) return NULL;

	TokenStream* stream = malloc(sizeof *stream);
	if (stream != NULL) {
		stream->buffer = source;
		stream->lookahead = 0;
		stream->has_lookahead = 0;
		return stream;
	}

	/* Allocation failed: do not leak the buffer that was already opened. */
	buffer_close(source);
	return NULL;
}

/**
 * Close the stream's buffer and free the stream itself.
 * @param ts The stream to release; NULL is accepted and ignored.
 */
void tokenstream_close(TokenStream* ts) {
	if (ts != NULL) {
		buffer_close(ts->buffer);
		free(ts);
	}
}

/**
 * Read the next token from the stream.
 *
 * Whitespace and `//` line comments are skipped first. The next character
 * is then matched against the single-character tokens, and finally against
 * keywords and identifiers.
 *
 * @param ts The stream to read from; may be NULL.
 * @returns The recognized token, or -1 when ts (or its buffer) is NULL, the
 *          input is exhausted, a '/' is not followed by a second '/', or the
 *          character does not start any known token. These cases are not
 *          distinguishable from the return value alone.
 */
Token tokenstream_next(TokenStream* ts) {
	if (ts == NULL || ts->buffer == NULL) return -1;

	char c;

	/* Skip whitespace and comments */
	while ((c = read_char(ts)) != (char)-1) {
		/* <ctype.h> functions require a value representable as unsigned char;
		 * a plain char can be negative (e.g. non-ASCII input bytes). */
		if (isspace((unsigned char)c)) {
			continue;
		}

		/* Handle comments */
		if (c == '/') {
			char next = read_char(ts);
			if (next == '/') {
				/* Skip until end of line */
				while ((c = read_char(ts)) != (char)-1 && c != '\n') {
					/* Skip */
				}
				continue;
			}
			/* A lone '/' is not a token: put back the character after it and report failure */
			unread_char(ts, next);
			return -1;
		}

		/* We found a non-whitespace, non-comment character */
		break;
	}

	if (c == (char)-1) return -1; /* EOF */

	/* Single-character tokens */
	switch (c) {
	case '(': return TOKEN_PARENT_OPEN;
	case ')': return TOKEN_PARENT_CLOSE;
	case '[': return TOKEN_BRACKET_OPEN;
	case ']': return TOKEN_BRACKET_CLOSE;
	case ',': return TOKEN_COMMA;
	case ';': return TOKEN_SEMICOLON;
	default: break; /* Not punctuation; fall through to the identifier check */
	}

	/* Keywords and identifiers */
	if (is_identifier_start(c)) {
		return read_keyword_or_identifier(ts, c);
	}

	/* Unknown character */
	return -1;
}