Implement tokenstream_info and refactor TokenStream interface
This commit is contained in:
@@ -35,6 +35,7 @@ static TestCase s_tests[] = {
|
|||||||
{"tokenstream_comma", test_tokenstream_comma},
|
{"tokenstream_comma", test_tokenstream_comma},
|
||||||
{"tokenstream_whitespace_ignored", test_tokenstream_whitespace_ignored},
|
{"tokenstream_whitespace_ignored", test_tokenstream_whitespace_ignored},
|
||||||
{"tokenstream_void_function_signature", test_tokenstream_void_function_signature},
|
{"tokenstream_void_function_signature", test_tokenstream_void_function_signature},
|
||||||
|
{"tokenstream_info", test_tokenstream_info},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+29
-12
@@ -1,5 +1,6 @@
|
|||||||
#include "test.h"
|
#include "test.h"
|
||||||
#include "token.h"
|
#include "token.h"
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
static void test_tokenstream_open_fail(void) {
|
static void test_tokenstream_open_fail(void) {
|
||||||
TokenStream* ts = tokenstream_open(NULL);
|
TokenStream* ts = tokenstream_open(NULL);
|
||||||
@@ -7,8 +8,7 @@ static void test_tokenstream_open_fail(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_tokenstream_simple_keyword(void) {
|
static void test_tokenstream_simple_keyword(void) {
|
||||||
Buffer* buf = buffer_open_string("module");
|
TokenStream* ts = tokenstream_open("module");
|
||||||
TokenStream* ts = tokenstream_open(buf);
|
|
||||||
|
|
||||||
Token t = tokenstream_next(ts);
|
Token t = tokenstream_next(ts);
|
||||||
if (t != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
if (t != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
||||||
@@ -20,8 +20,7 @@ static void test_tokenstream_simple_keyword(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_tokenstream_keywords_and_symbols(void) {
|
static void test_tokenstream_keywords_and_symbols(void) {
|
||||||
Buffer* buf = buffer_open_string("module main; import stdio;");
|
TokenStream* ts = tokenstream_open("module main; import stdio;");
|
||||||
TokenStream* ts = tokenstream_open(buf);
|
|
||||||
|
|
||||||
if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
||||||
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER (main)");
|
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER (main)");
|
||||||
@@ -35,8 +34,7 @@ static void test_tokenstream_keywords_and_symbols(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_tokenstream_parentheses_and_brackets(void) {
|
static void test_tokenstream_parentheses_and_brackets(void) {
|
||||||
Buffer* buf = buffer_open_string("()[]");
|
TokenStream* ts = tokenstream_open("()[]");
|
||||||
TokenStream* ts = tokenstream_open(buf);
|
|
||||||
|
|
||||||
if (tokenstream_next(ts) != TOKEN_PARENT_OPEN) fail("expected TOKEN_PARENT_OPEN");
|
if (tokenstream_next(ts) != TOKEN_PARENT_OPEN) fail("expected TOKEN_PARENT_OPEN");
|
||||||
if (tokenstream_next(ts) != TOKEN_PARENT_CLOSE) fail("expected TOKEN_PARENT_CLOSE");
|
if (tokenstream_next(ts) != TOKEN_PARENT_CLOSE) fail("expected TOKEN_PARENT_CLOSE");
|
||||||
@@ -48,8 +46,7 @@ static void test_tokenstream_parentheses_and_brackets(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_tokenstream_comma(void) {
|
static void test_tokenstream_comma(void) {
|
||||||
Buffer* buf = buffer_open_string("a,b,c");
|
TokenStream* ts = tokenstream_open("a,b,c");
|
||||||
TokenStream* ts = tokenstream_open(buf);
|
|
||||||
|
|
||||||
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected a");
|
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected a");
|
||||||
if (tokenstream_next(ts) != TOKEN_COMMA) fail("expected comma");
|
if (tokenstream_next(ts) != TOKEN_COMMA) fail("expected comma");
|
||||||
@@ -62,8 +59,7 @@ static void test_tokenstream_comma(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_tokenstream_whitespace_ignored(void) {
|
static void test_tokenstream_whitespace_ignored(void) {
|
||||||
Buffer* buf = buffer_open_string(" module \n\t import ; ");
|
TokenStream* ts = tokenstream_open(" module \n\t import ; ");
|
||||||
TokenStream* ts = tokenstream_open(buf);
|
|
||||||
|
|
||||||
if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
if (tokenstream_next(ts) != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
||||||
if (tokenstream_next(ts) != TOKEN_IMPORT) fail("expected TOKEN_IMPORT");
|
if (tokenstream_next(ts) != TOKEN_IMPORT) fail("expected TOKEN_IMPORT");
|
||||||
@@ -74,8 +70,7 @@ static void test_tokenstream_whitespace_ignored(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_tokenstream_void_function_signature(void) {
|
static void test_tokenstream_void_function_signature(void) {
|
||||||
Buffer* buf = buffer_open_string("void main()");
|
TokenStream* ts = tokenstream_open("void main()");
|
||||||
TokenStream* ts = tokenstream_open(buf);
|
|
||||||
|
|
||||||
if (tokenstream_next(ts) != TOKEN_VOID) fail("expected TOKEN_VOID");
|
if (tokenstream_next(ts) != TOKEN_VOID) fail("expected TOKEN_VOID");
|
||||||
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
|
if (tokenstream_next(ts) != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
|
||||||
@@ -85,3 +80,25 @@ static void test_tokenstream_void_function_signature(void) {
|
|||||||
|
|
||||||
tokenstream_close(ts);
|
tokenstream_close(ts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void test_tokenstream_info(void) {
|
||||||
|
TokenStream* ts = tokenstream_open("module main;");
|
||||||
|
|
||||||
|
Token t1 = tokenstream_next(ts);
|
||||||
|
TokenInfo info1;
|
||||||
|
tokenstream_info(ts, &info1);
|
||||||
|
if (t1 != TOKEN_MODULE) fail("expected TOKEN_MODULE");
|
||||||
|
if (info1.token != TOKEN_MODULE) fail("info: expected TOKEN_MODULE");
|
||||||
|
if (info1.text_length != 6) fail("info: expected length 6");
|
||||||
|
if (strncmp(info1.text, "module", 6) != 0) fail("info: expected 'module'");
|
||||||
|
|
||||||
|
Token t2 = tokenstream_next(ts);
|
||||||
|
TokenInfo info2;
|
||||||
|
tokenstream_info(ts, &info2);
|
||||||
|
if (t2 != TOKEN_IDENTIFIER) fail("expected TOKEN_IDENTIFIER");
|
||||||
|
if (info2.token != TOKEN_IDENTIFIER) fail("info: expected TOKEN_IDENTIFIER");
|
||||||
|
if (info2.text_length != 4) fail("info: expected length 4");
|
||||||
|
if (strncmp(info2.text, "main", 4) != 0) fail("info: expected 'main'");
|
||||||
|
|
||||||
|
tokenstream_close(ts);
|
||||||
|
}
|
||||||
|
|||||||
+62
-57
@@ -1,9 +1,14 @@
|
|||||||
#include "token.h"
|
#include "token.h"
|
||||||
#include "buffer.h"
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
/* Tokenizer state for one source text; allocated by tokenstream_open and freed by tokenstream_close. */
struct TokenStream {
|
||||||

/* Source text being tokenized. Only the caller's pointer is stored; the text is neither copied nor freed here. */
const char* code;
|
||||||

/* Index into code of the next character to be read. */
size_t pos;
|
||||||

/* Details of the most recent token; this is what tokenstream_info copies out. */
TokenInfo last_info;
|
||||||

};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Easy-to-read and modify keyword-to-token mapping.
|
* Easy-to-read and modify keyword-to-token mapping.
|
||||||
* Add new keywords here.
|
* Add new keywords here.
|
||||||
@@ -23,22 +28,17 @@ static const KeywordMap keywords[] = {
|
|||||||
* Look up a keyword in the keyword map.
|
* Look up a keyword in the keyword map.
|
||||||
* Returns TOKEN_IDENTIFIER if not found.
|
* Returns TOKEN_IDENTIFIER if not found.
|
||||||
*/
|
*/
|
||||||
static Token lookup_keyword(const char* str) {
|
static Token lookup_keyword(const char* str, size_t length) {
|
||||||
int count = sizeof(keywords) / sizeof(keywords[0]);
|
int count = sizeof(keywords) / sizeof(keywords[0]);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
if (strcmp(keywords[i].keyword, str) == 0) {
|
if (strlen(keywords[i].keyword) == length &&
|
||||||
|
strncmp(keywords[i].keyword, str, length) == 0) {
|
||||||
return keywords[i].token;
|
return keywords[i].token;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return TOKEN_IDENTIFIER;
|
return TOKEN_IDENTIFIER;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct TokenStream {
|
|
||||||
Buffer* buffer;
|
|
||||||
char lookahead;
|
|
||||||
int has_lookahead;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if a character is the start of an identifier.
|
* Check if a character is the start of an identifier.
|
||||||
*/
|
*/
|
||||||
@@ -54,72 +54,63 @@ static int is_identifier_part(char c) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read a character, using lookahead if available.
|
* Read the next character from the stream and advance past it. Returns (char)-1 at the end of the text.
|
||||||
*/
|
*/
|
||||||
static char read_char(TokenStream* ts) {
|
static char read_char(TokenStream* ts) {
|
||||||
if (ts->has_lookahead) {
|
char c = ts->code[ts->pos];
|
||||||
ts->has_lookahead = 0;
|
if (c == '\0') return (char)-1;
|
||||||
return ts->lookahead;
|
ts->pos++;
|
||||||
}
|
return c;
|
||||||
return buffer_read(ts->buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Put a character back into the lookahead buffer.
|
* Peek at the next character in the stream without consuming it. Returns (char)-1 at the end of the text.
|
||||||
*/
|
*/
|
||||||
static void unread_char(TokenStream* ts, char c) {
|
static char peek_char(TokenStream* ts) {
|
||||||
if (c != (char)-1) {
|
char c = ts->code[ts->pos];
|
||||||
ts->lookahead = c;
|
if (c == '\0') return (char)-1;
|
||||||
ts->has_lookahead = 1;
|
return c;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Try to read a keyword or identifier starting with the given character.
|
|
||||||
* Returns the token type, or TOKEN_IDENTIFIER if it doesn't match a keyword.
|
|
||||||
*/
|
|
||||||
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
|
static Token read_keyword_or_identifier(TokenStream* ts, char first) {
|
||||||
char buffer[256];
|
const char* start = &ts->code[ts->pos - 1];
|
||||||
int index = 0;
|
size_t length = 1;
|
||||||
buffer[index++] = first;
|
|
||||||
|
|
||||||
char c;
|
while (is_identifier_part(peek_char(ts))) {
|
||||||
while ((c = read_char(ts)) != (char)-1 && is_identifier_part(c)) {
|
read_char(ts);
|
||||||
if (index < 255) {
|
length++;
|
||||||
buffer[index++] = c;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Put back the character that ended the identifier */
|
Token token = lookup_keyword(start, length);
|
||||||
unread_char(ts, c);
|
ts->last_info.token = token;
|
||||||
buffer[index] = '\0';
|
ts->last_info.text = (char*)start;
|
||||||
|
ts->last_info.text_length = length;
|
||||||
/* Check for keywords */
|
return token;
|
||||||
return lookup_keyword(buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TokenStream* tokenstream_open(Buffer* buffer) {
|
TokenStream* tokenstream_open(const char* code) {
|
||||||
if (buffer == NULL) return NULL;
|
if (code == NULL) return NULL;
|
||||||
|
|
||||||
TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream));
|
TokenStream* ts = (TokenStream*)malloc(sizeof(struct TokenStream));
|
||||||
if (ts == NULL) {
|
if (ts == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
ts->buffer = buffer;
|
ts->code = code;
|
||||||
ts->lookahead = 0;
|
ts->pos = 0;
|
||||||
ts->has_lookahead = 0;
|
ts->last_info.text = NULL;
|
||||||
|
ts->last_info.text_length = 0;
|
||||||
|
ts->last_info.token = (Token)-1;
|
||||||
return ts;
|
return ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
void tokenstream_close(TokenStream* ts) {
|
void tokenstream_close(TokenStream* ts) {
|
||||||
if (ts == NULL) return;
|
if (ts == NULL) return;
|
||||||
buffer_close(ts->buffer);
|
|
||||||
free(ts);
|
free(ts);
|
||||||
}
|
}
|
||||||
|
|
||||||
Token tokenstream_next(TokenStream* ts) {
|
Token tokenstream_next(TokenStream* ts) {
|
||||||
if (ts == NULL || ts->buffer == NULL) return -1;
|
if (ts == NULL) return -1;
|
||||||
|
|
||||||
char c;
|
char c;
|
||||||
|
|
||||||
@@ -131,16 +122,14 @@ Token tokenstream_next(TokenStream* ts) {
|
|||||||
|
|
||||||
/* Handle comments */
|
/* Handle comments */
|
||||||
if (c == '/') {
|
if (c == '/') {
|
||||||
char next = read_char(ts);
|
if (peek_char(ts) == '/') {
|
||||||
if (next == '/') {
|
|
||||||
/* Skip until end of line */
|
/* Skip until end of line */
|
||||||
while ((c = read_char(ts)) != (char)-1 && c != '\n') {
|
while ((c = read_char(ts)) != (char)-1 && c != '\n') {
|
||||||
/* Skip */
|
/* Skip */
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* Put back the character after / */
|
/* It's just a slash, which we don't handle yet */
|
||||||
unread_char(ts, next);
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -148,16 +137,24 @@ Token tokenstream_next(TokenStream* ts) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c == (char)-1) return -1; /* EOF */
|
if (c == (char)-1) {
|
||||||
|
ts->last_info.token = (Token)-1;
|
||||||
|
ts->last_info.text = NULL;
|
||||||
|
ts->last_info.text_length = 0;
|
||||||
|
return -1; /* EOF */
|
||||||
|
}
|
||||||
|
|
||||||
/* Single-character tokens */
|
/* Single-character tokens */
|
||||||
|
ts->last_info.text = (char*)&ts->code[ts->pos - 1];
|
||||||
|
ts->last_info.text_length = 1;
|
||||||
|
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case '(': return TOKEN_PARENT_OPEN;
|
case '(': return ts->last_info.token = TOKEN_PARENT_OPEN;
|
||||||
case ')': return TOKEN_PARENT_CLOSE;
|
case ')': return ts->last_info.token = TOKEN_PARENT_CLOSE;
|
||||||
case '[': return TOKEN_BRACKET_OPEN;
|
case '[': return ts->last_info.token = TOKEN_BRACKET_OPEN;
|
||||||
case ']': return TOKEN_BRACKET_CLOSE;
|
case ']': return ts->last_info.token = TOKEN_BRACKET_CLOSE;
|
||||||
case ',': return TOKEN_COMMA;
|
case ',': return ts->last_info.token = TOKEN_COMMA;
|
||||||
case ';': return TOKEN_SEMICOLON;
|
case ';': return ts->last_info.token = TOKEN_SEMICOLON;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Keywords and identifiers */
|
/* Keywords and identifiers */
|
||||||
@@ -166,5 +163,13 @@ Token tokenstream_next(TokenStream* ts) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Unknown character */
|
/* Unknown character */
|
||||||
|
ts->last_info.token = (Token)-1;
|
||||||
|
ts->last_info.text = NULL;
|
||||||
|
ts->last_info.text_length = 0;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Copies the details of the token most recently produced by
 * tokenstream_next into *info.
 *
 * @param ts   The TokenStream to query. When NULL there is no token to
 *             describe, and *info receives the "no token" values
 *             (token (Token)-1, text NULL, text_length 0) so the caller
 *             never reads an indeterminate TokenInfo.
 * @param info Destination for the result. When NULL the call does nothing.
 *
 * The copied text pointer is not owned by the caller and is not
 * NUL-terminated per token; use text_length to delimit it.
 */
void tokenstream_info(TokenStream* ts, TokenInfo* info) {
    if (info == NULL) return;

    if (ts == NULL) {
        /* Clear everything first so any field not named below is still defined. */
        memset(info, 0, sizeof *info);
        info->token = (Token)-1;
        info->text = NULL;
        info->text_length = 0;
        return;
    }

    *info = ts->last_info;
}
|
||||||
|
|||||||
+14
-6
@@ -4,7 +4,7 @@
|
|||||||
#ifndef TOKEN_H
|
#ifndef TOKEN_H
|
||||||
#define TOKEN_H
|
#define TOKEN_H
|
||||||
|
|
||||||
#include "buffer.h"
|
#include <stddef.h>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A list of all possible tokens.
|
* A list of all possible tokens.
|
||||||
@@ -34,6 +34,7 @@ typedef enum {
|
|||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/// @brief The textual representation of a token.
|
/// @brief The textual representation of a token.
|
||||||
|
/// Note that this is not necessarily null-terminated; use `text_length` to delimit it.
|
||||||
char* text;
|
char* text;
|
||||||
|
|
||||||
/// @brief The length of the `text` string.
|
/// @brief The length of the `text` string.
|
||||||
@@ -46,14 +47,12 @@ typedef struct {
|
|||||||
typedef struct TokenStream TokenStream;
|
typedef struct TokenStream TokenStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a TokenStream for a given buffer.
|
* Returns a TokenStream that tokenizes the given text.
|
||||||
*
|
*
|
||||||
* When the tokenstream is closed, the underlying buffer is also closed.
|
* @param code The NUL-terminated text to read. It is not copied, so it must remain valid while the TokenStream is in use.
|
||||||
*
|
|
||||||
* @param buffer The buffer to read from.
|
|
||||||
* @returns A handle to the TokenStream.
|
* @returns A handle to the TokenStream.
|
||||||
*/
|
*/
|
||||||
TokenStream* tokenstream_open(Buffer* buffer);
|
TokenStream* tokenstream_open(const char* code);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Closes a TokenStream.
|
* Closes a TokenStream.
|
||||||
@@ -68,4 +67,13 @@ void tokenstream_close(TokenStream* ts);
|
|||||||
*/
|
*/
|
||||||
Token tokenstream_next(TokenStream* ts);
|
Token tokenstream_next(TokenStream* ts);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets additional information about the last token that was returned
|
||||||
|
* by `tokenstream_next`.
|
||||||
|
*
|
||||||
|
* @param ts The TokenStream to use.
|
||||||
|
* @param info The TokenInfo object to store the results in.
|
||||||
|
*/
|
||||||
|
void tokenstream_info(TokenStream* ts, TokenInfo* info);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
Reference in New Issue
Block a user