/* c2/v0/token.c */

#include "token.h"
#include "buffer.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/**
 * One row of the keyword-to-token mapping: the spelling of a reserved
 * word paired with the token returned for it.
 * To add a new keyword, append a row to the `keywords` table below.
 */
typedef struct {
	const char* keyword; /* keyword text, compared with strcmp() in lookup_keyword() */
	Token token;         /* token returned when the text matches */
} KeywordMap;

/*
 * Keyword table: each row maps a reserved word to its token.
 * lookup_keyword() scans the rows in order and derives the row count
 * from sizeof, so adding a keyword only requires a new row here.
 */
static const KeywordMap keywords[] = {
	{"module", TOKEN_MODULE},
	{"import", TOKEN_IMPORT},
	{"void", TOKEN_VOID},
};

/**
 * Look up a keyword in the keyword map.
 * Returns TOKEN_IDENTIFIER if not found.
 */
static Token lookup_keyword(const char* str) {
	int count = sizeof(keywords) / sizeof(keywords[0]);
	for (int i = 0; i < count; i++) {
		if (strcmp(keywords[i].keyword, str) == 0) {
			return keywords[i].token;
		}
	}
	return TOKEN_IDENTIFIER;
}

/*
 * State of a token stream: the input buffer plus a one-character
 * pushback slot used by read_char()/unread_char().
 */
struct TokenStream {
	Buffer* buffer;     /* input source; characters are pulled with buffer_read() */
	char lookahead;     /* character pushed back by unread_char() */
	int has_lookahead;  /* non-zero while `lookahead` holds a pending character */
};

/**
 * Check if a character is the start of an identifier.
 *
 * An identifier may begin with a letter (as classified by isalpha())
 * or an underscore.
 *
 * @param c Character to classify.
 * @return 1 if c can begin an identifier, 0 otherwise.
 */
static int is_identifier_start(char c) {
	/* <ctype.h> functions require a value representable as unsigned char
	 * (or EOF). Plain char may be negative, so convert first to avoid
	 * undefined behavior on bytes >= 0x80. */
	return isalpha((unsigned char)c) || c == '_';
}

/**
 * Check if a character can be part of an identifier.
 *
 * After the first character an identifier may contain letters and
 * digits (as classified by isalnum()) and underscores.
 *
 * @param c Character to classify.
 * @return 1 if c can continue an identifier, 0 otherwise.
 */
static int is_identifier_part(char c) {
	/* <ctype.h> functions require a value representable as unsigned char
	 * (or EOF). Plain char may be negative, so convert first to avoid
	 * undefined behavior on bytes >= 0x80. */
	return isalnum((unsigned char)c) || c == '_';
}

/**
 * Fetch the next input character.
 *
 * A character previously pushed back with unread_char() is returned
 * first (and the pushback slot is cleared); otherwise the character
 * comes straight from buffer_read().
 */
static char read_char(TokenStream* ts) {
	if (!ts->has_lookahead) {
		return buffer_read(ts->buffer);
	}

	/* Consume the pending pushed-back character. */
	ts->has_lookahead = 0;
	return ts->lookahead;
}

/**
 * Push a character back so the next read_char() returns it again.
 *
 * The value (char)-1 is ignored and leaves the stream state untouched.
 * Only one character can be pending; a second push overwrites the
 * first.
 */
static void unread_char(TokenStream* ts, char c) {
	if (c == (char)-1) {
		return;
	}

	ts->has_lookahead = 1;
	ts->lookahead = c;
}

/**
 * Scan the rest of a word whose first character has already been read
 * and classify it.
 *
 * Characters are consumed while is_identifier_part() accepts them. The
 * first rejected character is pushed back for the next token. Only the
 * first 255 characters are kept for the keyword comparison; any excess
 * is consumed and dropped.
 *
 * @param ts    Stream to read from.
 * @param first The character that started the word.
 * @return The keyword token for the collected text, or
 *         TOKEN_IDENTIFIER when it is not a keyword.
 */
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
	char text[256];
	int length = 0;
	char next;

	text[length++] = first;

	for (;;) {
		next = read_char(ts);
		if (next == (char)-1 || !is_identifier_part(next)) {
			break;
		}
		/* Leave room for the terminating NUL. */
		if (length < 255) {
			text[length++] = next;
		}
	}

	/* Return the terminating character to the stream. */
	unread_char(ts, next);
	text[length] = '\0';

	return lookup_keyword(text);
}

/**
 * Create a token stream that reads from the given buffer.
 *
 * The buffer pointer is stored, not copied; tokenstream_close() later
 * calls buffer_close() on it.
 *
 * @param buffer Input buffer to tokenize.
 * @return A new stream with an empty lookahead slot, or NULL when
 *         buffer is NULL or the allocation fails.
 */
TokenStream* tokenstream_open(Buffer* buffer) {
	TokenStream* ts;

	if (buffer == NULL) {
		return NULL;
	}

	ts = malloc(sizeof *ts);
	if (ts != NULL) {
		ts->buffer = buffer;
		ts->lookahead = 0;
		ts->has_lookahead = 0;
	}
	return ts;
}

/**
 * Destroy a token stream.
 *
 * Calls buffer_close() on the stream's buffer and then frees the
 * stream itself. A NULL stream is ignored.
 */
void tokenstream_close(TokenStream* ts) {
	if (ts != NULL) {
		buffer_close(ts->buffer);
		free(ts);
	}
}

/**
 * Scan and return the next token from the stream.
 *
 * Whitespace and "//" line comments are skipped. The function then
 * recognizes the single-character punctuation tokens, and words
 * (keywords or identifiers).
 *
 * @param ts Token stream to read from.
 * @return The recognized token, or -1 in any of these cases:
 *         - ts or its buffer is NULL,
 *         - end of input was reached,
 *         - a '/' was found that does not start a "//" comment,
 *         - any other unrecognized character was found.
 */
Token tokenstream_next(TokenStream* ts) {
	if (ts == NULL || ts->buffer == NULL) return -1;

	char c;

	/* Skip whitespace and comments */
	while ((c = read_char(ts)) != (char)-1) {
		/* <ctype.h> functions require a value representable as
		 * unsigned char; plain char may be negative for bytes >= 0x80,
		 * which would be undefined behavior without the cast. */
		if (isspace((unsigned char)c)) {
			continue;
		}

		/* Handle comments */
		if (c == '/') {
			char next = read_char(ts);
			if (next == '/') {
				/* Skip until end of line (or end of input) */
				while ((c = read_char(ts)) != (char)-1 && c != '\n') {
					/* Skip */
				}
				continue;
			}
			/* A lone '/' is not a known token: put back the character
			 * that followed it and report failure. */
			unread_char(ts, next);
			return -1;
		}

		/* We found a non-whitespace, non-comment character */
		break;
	}

	if (c == (char)-1) return -1; /* EOF */

	/* Single-character tokens */
	switch (c) {
	case '(': return TOKEN_PARENT_OPEN;
	case ')': return TOKEN_PARENT_CLOSE;
	case '[': return TOKEN_BRACKET_OPEN;
	case ']': return TOKEN_BRACKET_CLOSE;
	case ',': return TOKEN_COMMA;
	case ';': return TOKEN_SEMICOLON;
	default: break; /* not punctuation; try words below */
	}

	/* Keywords and identifiers */
	if (is_identifier_start(c)) {
		return read_keyword_or_identifier(ts, c);
	}

	/* Unknown character */
	return -1;
}