jsfw/ser/lexer.c

373 lines
10 KiB
C
Raw Normal View History

2023-05-06 11:42:50 -05:00
#include "lexer.h"
#include "vector.h"
#include <stdbool.h>
#include <string.h>
typedef struct {
uint32_t start;
uint32_t current;
Source *src;
Location loc;
Location start_loc;
TokenVec tokens;
LexingErrorVec errors;
} Lexer;
static inline __attribute__((always_inline)) Token
token(Source *src, TokenType type, const char *lexeme, uint32_t len, uint64_t lit, Location loc) {
IF_DEBUG(src->ref_count++);
return (Token){
.src = src,
.lit = lit,
.span.loc = loc,
.span.len = len,
.type = type,
.lexeme = lexeme,
};
}
static inline __attribute__((always_inline)) LexingError
lexing_error(Source *src, LexingErrorType type, Location loc, uint32_t len) {
IF_DEBUG(src->ref_count++);
return (LexingError){
.src = src,
.type = type,
.span.loc = loc,
.span.len = len,
};
}
void token_drop(Token t) { IF_DEBUG(t.src->ref_count--); }
void lexing_error_drop(LexingError e) { IF_DEBUG(e.src->ref_count--); }
void lexing_result_drop(LexingResult res) {
vec_drop(res.tokens);
vec_drop(res.errors);
}
static Lexer lexer_init(Source *src) {
TokenVec tokens = vec_init();
vec_grow(&tokens, 256);
return (Lexer){
.start = 0,
.current = 0,
.src = src,
.loc = location(1, 0, 0),
.start_loc = location(1, 1, 0),
.tokens = tokens,
.errors = vec_init(),
};
}
static void lexer_add_token(Lexer *lex, TokenType type, uint32_t len, uint64_t lit) {
vec_push(&lex->tokens, token(lex->src, type, &lex->src->str[lex->start], len, lit, lex->start_loc));
}
static void lexer_add_error(Lexer *lex, LexingErrorType type, uint32_t len) {
vec_push(&lex->errors, lexing_error(lex->src, type, lex->start_loc, len));
}
static char lexer_advance(Lexer *lex) {
char c = lex->src->str[lex->current++];
lex->loc.offset = lex->current;
lex->loc.column++;
if (c == '\n') {
lex->loc.line++;
lex->loc.column = 0;
}
return c;
}
static bool lexer_match(Lexer *lex, char exp) {
if (lex->current >= lex->src->len)
return false;
if (lex->src->str[lex->current] != exp)
return false;
lexer_advance(lex);
return true;
}
static bool lexer_match_not(Lexer *lex, char unexp) {
if (lex->current >= lex->src->len)
return false;
if (lex->src->str[lex->current] == unexp)
return false;
lexer_advance(lex);
return true;
}
static char lexer_peek(Lexer *lex) { return lex->src->str[lex->current]; }
inline static bool is_digit(char c) { return c >= '0' && c <= '9'; }
inline static uint64_t to_digit(char c) { return c - '0'; }
inline static bool is_ident_start(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); }
inline static bool is_ident(char c) { return is_ident_start(c) || is_digit(c) || c == '_'; }
static void lexer_scan_number(Lexer *lex) {
// Get first digit (the one we already passed)
uint64_t lit = to_digit(lex->src->str[lex->start]);
uint32_t len = 1;
char c = lexer_peek(lex);
bool overflow = false;
while (is_digit(c)) {
uint64_t nlit = lit * 10 + to_digit(c);
if (nlit < lit) { // overflow
overflow = true;
}
lit = nlit;
lexer_advance(lex);
c = lexer_peek(lex);
len++;
}
if (overflow) {
lexer_add_error(lex, LexingErrorNumberLiteralOverflow, len);
}
lexer_add_token(lex, Number, len, lit);
}
static uint32_t u32max(uint32_t a, uint32_t b) { return a > b ? a : b; }
static inline __attribute__((always_inline)) void lexer_scan_ident(Lexer *lex) {
uint32_t len = 1;
while (is_ident(lexer_peek(lex))) {
lexer_advance(lex);
len++;
}
const char *s = &lex->src->str[lex->start];
#define handle(x, str) else if (strncmp(str, s, u32max(sizeof(str) - 1, len)) == 0) lexer_add_token(lex, x, len, 0)
if (false)
;
handle(Messages, "messages");
handle(Struct, "struct");
handle(Version, "version");
handle(Const, "const");
handle(Type, "type");
else lexer_add_token(lex, Ident, len, 0);
#undef handle
}
static void lexer_scan(Lexer *lex) {
char c = lexer_advance(lex);
switch (c) {
case '(':
lexer_add_token(lex, LeftParen, 1, 0);
break;
case ')':
lexer_add_token(lex, RightParen, 1, 0);
break;
case '{':
lexer_add_token(lex, LeftBrace, 1, 0);
break;
case '}':
lexer_add_token(lex, RightBrace, 1, 0);
break;
case '[':
lexer_add_token(lex, LeftBracket, 1, 0);
break;
case ']':
lexer_add_token(lex, RightBracket, 1, 0);
break;
case ',':
lexer_add_token(lex, Comma, 1, 0);
break;
case ';':
lexer_add_token(lex, Semicolon, 1, 0);
break;
case '&':
lexer_add_token(lex, Ampersand, 1, 0);
break;
case '^':
lexer_add_token(lex, Caret, 1, 0);
break;
case ':':
lexer_add_token(lex, Colon, 1, 0);
break;
case '=':
lexer_add_token(lex, Equal, 1, 0);
break;
case '#':
lexer_add_token(lex, Hash, 1, 0);
case '/':
if (lexer_match(lex, '/')) {
while (lexer_match_not(lex, '\n'))
;
}
break;
case ' ':
case '\t':
case '\n':
case '\r':
break;
default:
if (is_digit(c)) {
lexer_scan_number(lex);
} else if (is_ident_start(c)) {
lexer_scan_ident(lex);
} else {
// Try to merge with the last error if possible
if (lex->errors.len > 0) {
LexingError *last_err = &lex->errors.data[lex->errors.len - 1];
if (last_err->span.loc.line == lex->loc.line && last_err->type == LexingErrorUnexpectedCharacter &&
last_err->span.loc.column + last_err->span.len == lex->start_loc.column) {
last_err->span.len++;
break;
}
}
lexer_add_error(lex, LexingErrorUnexpectedCharacter, 1);
}
}
}
static void lexer_lex(Lexer *lex) {
while (lex->current < lex->src->len) {
lex->start = lex->current;
lex->start_loc = lex->loc;
lexer_scan(lex);
}
lex->start = lex->current;
lex->start_loc = lex->loc;
lexer_add_token(lex, Eof, 0, 0);
}
static LexingResult lexer_finish(Lexer lex) {
return (LexingResult){
.errors = lex.errors,
.tokens = lex.tokens,
};
}
LexingResult lex(Source *src) {
Lexer lex = lexer_init(src);
lexer_lex(&lex);
return lexer_finish(lex);
}
void lexing_error_report(LexingError *le) {
ReportSpan span = {.span = le->span, .sev = ReportSeverityError};
#define report(fmt, ...) source_report(le->src, le->span.loc, ReportSeverityError, &span, 1, NULL, fmt, __VA_ARGS__);
switch (le->type) {
case LexingErrorUnexpectedCharacter:
report("Unexpected character%s '%.*s'", le->span.len > 1 ? "s" : "", le->span.len, &le->src->str[le->span.loc.offset]);
break;
case LexingErrorNumberLiteralOverflow:
report("number literal '%.*s' overflows max value of %lu", le->span.len, &le->src->str[le->span.loc.offset], UINT64_MAX);
break;
default:
break;
}
#undef report
}
char *token_type_string(TokenType t) {
TokenType types[TOKEN_TYPE_COUNT];
size_t count = 0;
#define handle(type) \
if (t & type) \
types[count++] = type
handle(LeftParen);
handle(RightParen);
handle(LeftBrace);
handle(RightBrace);
handle(LeftBracket);
handle(RightBracket);
handle(Comma);
handle(Semicolon);
handle(Ampersand);
handle(Caret);
handle(Colon);
handle(Equal);
handle(Ident);
handle(Number);
handle(Messages);
handle(Struct);
handle(Version);
handle(Const);
handle(Type);
handle(Eof);
#undef handle
CharVec str = vec_init();
for (size_t i = 0; i < count; i++) {
if (i == 0) {
} else if (i == count - 1) {
vec_push_array(&str, " or ", 4);
} else {
vec_push_array(&str, ", ", 2);
}
switch (types[i]) {
case LeftParen:
vec_push_array(&str, "'('", 3);
break;
case RightParen:
vec_push_array(&str, "')'", 3);
break;
case LeftBrace:
vec_push_array(&str, "'{'", 3);
break;
case RightBrace:
vec_push_array(&str, "'}'", 3);
break;
case LeftBracket:
vec_push_array(&str, "'['", 3);
break;
case RightBracket:
vec_push_array(&str, "']'", 3);
break;
case Comma:
vec_push_array(&str, "','", 3);
break;
case Semicolon:
vec_push_array(&str, "';'", 3);
break;
case Ampersand:
vec_push_array(&str, "'&'", 3);
break;
case Caret:
vec_push_array(&str, "'^'", 3);
break;
case Colon:
vec_push_array(&str, "':'", 3);
break;
case Equal:
vec_push_array(&str, "'='", 3);
break;
case Hash:
vec_push_array(&str, "'#'", 3);
break;
case Ident:
vec_push_array(&str, "identifier", 10);
break;
case Number:
vec_push_array(&str, "number literal", 15);
break;
case Messages:
vec_push_array(&str, "keyword messages", 16);
break;
case Struct:
vec_push_array(&str, "keyword struct", 14);
break;
case Version:
vec_push_array(&str, "keyword version", 15);
break;
case Const:
vec_push_array(&str, "keyword const", 13);
break;
case Type:
vec_push_array(&str, "keyword type", 12);
break;
case Eof:
vec_push_array(&str, "end of file", 11);
break;
}
}
vec_push(&str, '\0');
return str.data;
}