Implement tokenstream_info and refactor TokenStream interface
This commit is contained in:
+62
-57
@@ -1,9 +1,14 @@
|
||||
#include "token.h"
|
||||
#include "buffer.h"
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * State of a tokenizer that scans a NUL-terminated source string in place.
 */
struct TokenStream {
|
||||
/* Source text being tokenized. The pointer is stored exactly as it was
 * passed to tokenstream_open (no copy is made); reading stops at '\0'. */
const char* code;
|
||||
/* Index into code of the next character that has not been consumed yet. */
size_t pos;
|
||||
/* Token, text pointer and text length recorded for the most recent token;
 * tokenstream_info hands a copy of this to the caller. */
TokenInfo last_info;
|
||||
};
|
||||
|
||||
/**
|
||||
* Easy-to-read and modify keyword-to-token mapping.
|
||||
* Add new keywords here.
|
||||
@@ -23,22 +28,17 @@ static const KeywordMap keywords[] = {
|
||||
* Look up a keyword in the keyword map.
|
||||
* Returns TOKEN_IDENTIFIER if not found.
|
||||
*/
|
||||
static Token lookup_keyword(const char* str) {
|
||||
static Token lookup_keyword(const char* str, size_t length) {
|
||||
int count = sizeof(keywords) / sizeof(keywords[0]);
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (strcmp(keywords[i].keyword, str) == 0) {
|
||||
if (strlen(keywords[i].keyword) == length &&
|
||||
strncmp(keywords[i].keyword, str, length) == 0) {
|
||||
return keywords[i].token;
|
||||
}
|
||||
}
|
||||
return TOKEN_IDENTIFIER;
|
||||
}
|
||||
|
||||
struct TokenStream {
|
||||
Buffer* buffer;
|
||||
char lookahead;
|
||||
int has_lookahead;
|
||||
};
|
||||
|
||||
/**
|
||||
* Check if a character is the start of an identifier.
|
||||
*/
|
||||
@@ -54,72 +54,63 @@ static int is_identifier_part(char c) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a character, using lookahead if available.
|
||||
* Read a character from the stream.
|
||||
*/
|
||||
static char read_char(TokenStream* ts) {
|
||||
if (ts->has_lookahead) {
|
||||
ts->has_lookahead = 0;
|
||||
return ts->lookahead;
|
||||
}
|
||||
return buffer_read(ts->buffer);
|
||||
char c = ts->code[ts->pos];
|
||||
if (c == '\0') return (char)-1;
|
||||
ts->pos++;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Put a character back into the lookahead buffer.
|
||||
* Peek at the next character in the stream.
|
||||
*/
|
||||
static void unread_char(TokenStream* ts, char c) {
|
||||
if (c != (char)-1) {
|
||||
ts->lookahead = c;
|
||||
ts->has_lookahead = 1;
|
||||
}
|
||||
static char peek_char(TokenStream* ts) {
|
||||
char c = ts->code[ts->pos];
|
||||
if (c == '\0') return (char)-1;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to read a keyword or identifier starting with the given character.
|
||||
* Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword.
|
||||
*/
|
||||
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
|
||||
char buffer[256];
|
||||
int index = 0;
|
||||
buffer[index++] = first;
|
||||
const char* start = &ts->code[ts->pos - 1];
|
||||
size_t length = 1;
|
||||
|
||||
char c;
|
||||
while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) {
|
||||
if (index < 255) {
|
||||
buffer[index++] = c;
|
||||
}
|
||||
while (is_identifier_part(peek_char(ts))) {
|
||||
read_char(ts);
|
||||
length++;
|
||||
}
|
||||
|
||||
/* Put back the character that ended the identifier */
|
||||
unread_char(ts, c);
|
||||
buffer[index] = '\0';
|
||||
|
||||
/* Check for keywords */
|
||||
return lookup_keyword(buffer);
|
||||
Token token = lookup_keyword(start, length);
|
||||
ts->last_info.token = token;
|
||||
ts->last_info.text = (char*)start;
|
||||
ts->last_info.text_length = length;
|
||||
return token;
|
||||
}
|
||||
|
||||
TokenStream* tokenstream_open(Buffer* buffer) {
|
||||
if (buffer == NULL) return NULL;
|
||||
TokenStream* tokenstream_open(const char* code) {
|
||||
if (code == NULL) return NULL;
|
||||
|
||||
TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream));
|
||||
if (ts == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ts->buffer = buffer;
|
||||
ts->lookahead = 0;
|
||||
ts->has_lookahead = 0;
|
||||
ts->code = code;
|
||||
ts->pos = 0;
|
||||
ts->last_info.text = NULL;
|
||||
ts->last_info.text_length = 0;
|
||||
ts->last_info.token = (Token)-1;
|
||||
return ts;
|
||||
}
|
||||
|
||||
void tokenstream_close(TokenStream* ts) {
|
||||
if (ts == NULL) return;
|
||||
buffer_close(ts->buffer);
|
||||
free(ts);
|
||||
}
|
||||
|
||||
Token tokenstream_next(TokenStream* ts) {
|
||||
if (ts == NULL || ts->buffer == NULL) return -1;
|
||||
if (ts == NULL) return -1;
|
||||
|
||||
char c;
|
||||
|
||||
@@ -131,16 +122,14 @@ Token tokenstream_next(TokenStream* ts) {
|
||||
|
||||
/* Handle comments */
|
||||
if (c == '/') {
|
||||
char next = read_char(ts);
|
||||
if (next == '/') {
|
||||
if (peek_char(ts) == '/') {
|
||||
/* Skip until end of line */
|
||||
while ((c = read_char(ts)) != (char)-1 && c != '\n') {
|
||||
/* Skip */
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/* Put back the character after / */
|
||||
unread_char(ts, next);
|
||||
/* It's just a slash, which we don't handle yet */
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -148,16 +137,24 @@ Token tokenstream_next(TokenStream* ts) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == (char)-1) return -1; /* EOF */
|
||||
if (c == (char)-1) {
|
||||
ts->last_info.token = (Token)-1;
|
||||
ts->last_info.text = NULL;
|
||||
ts->last_info.text_length = 0;
|
||||
return -1; /* EOF */
|
||||
}
|
||||
|
||||
/* Single-character tokens */
|
||||
ts->last_info.text = (char*)&ts->code[ts->pos - 1];
|
||||
ts->last_info.text_length = 1;
|
||||
|
||||
switch (c) {
|
||||
case '(': return TOKEN_PARENT_OPEN;
|
||||
case ')': return TOKEN_PARENT_CLOSE;
|
||||
case '[': return TOKEN_BRACKET_OPEN;
|
||||
case ']': return TOKEN_BRACKET_CLOSE;
|
||||
case ',': return TOKEN_COMMA;
|
||||
case ';': return TOKEN_SEMICOLON;
|
||||
case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
|
||||
case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
|
||||
case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
|
||||
case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
|
||||
case ',': return ts->last_info.token = TOKEN_COMMA;
|
||||
case ';': return ts->last_info.token = TOKEN_SEMICOLON;
|
||||
}
|
||||
|
||||
/* Keywords and identifiers */
|
||||
@@ -166,5 +163,13 @@ Token tokenstream_next(TokenStream* ts) {
|
||||
}
|
||||
|
||||
/* Unknown character */
|
||||
ts->last_info.token = (Token)-1;
|
||||
ts->last_info.text = NULL;
|
||||
ts->last_info.text_length = 0;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
 * Copy the info recorded for the most recently scanned token into *info.
 *
 * ts:   token stream to query.
 * info: destination, overwritten with ts->last_info.
 *
 * If either pointer is NULL the function returns without writing anything.
 * The copy is a plain struct assignment, so info->text is the same pointer
 * that the stream holds, not a duplicated string.
 */
void tokenstream_info(TokenStream* ts, TokenInfo* info) {
|
||||
/* Nothing to read from or nowhere to write to: leave *info untouched. */
if (ts == NULL || info == NULL) return;
|
||||
/* Shallow copy of the whole TokenInfo struct. */
*info = ts->last_info;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user